Commit 6eb178dc authored by Dr.李

added scikit-learn compatible standardize

parent 25b755fe
...@@ -7,3 +7,6 @@ Alpha_Mind.egg-info/* ...@@ -7,3 +7,6 @@ Alpha_Mind.egg-info/*
*.c *.c
*.cpp *.cpp
*.html *.html
*.nbc
*.nbi
/notebooks/.ipynb_checkpoints
\ No newline at end of file
...@@ -8,8 +8,13 @@ Created on 2017-4-25 ...@@ -8,8 +8,13 @@ Created on 2017-4-25
import numpy as np import numpy as np
from alphamind.utilities import group_mapping from alphamind.utilities import group_mapping
from alphamind.utilities import transform from alphamind.utilities import transform
from alphamind.utilities import aggregate
from alphamind.utilities import simple_mean from alphamind.utilities import simple_mean
from alphamind.utilities import simple_std from alphamind.utilities import simple_std
from alphamind.utilities import array_index
from numba import jitclass
from numba import int32, float64
def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray: def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray:
...@@ -21,4 +26,64 @@ def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray: ...@@ -21,4 +26,64 @@ def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray:
return (x - mean_values) / std_values return (x - mean_values) / std_values
else: else:
return (x - simple_mean(x, axis=0)) / simple_std(x, axis=0) return (x - simple_mean(x, axis=0)) / simple_std(x, axis=0, ddof=ddof)
class Standardizer(object):
    """Column-wise standardizer with a scikit-learn style fit/transform API.

    ``fit`` stores the statistics returned by ``simple_mean`` and
    ``simple_std`` (both called with ``axis=0``); ``transform`` applies
    ``(x - mean_) / std_`` to any array that broadcasts against them.

    Attributes are ``None`` until ``fit`` has been called, so calling
    ``transform`` on an unfitted instance fails with a ``TypeError``.
    """

    def __init__(self, ddof=1):
        """Create an unfitted standardizer.

        :param ddof: value forwarded as ``ddof`` to ``simple_std`` during
            ``fit`` (presumably the delta degrees of freedom -- confirm
            against ``alphamind.utilities.simple_std``).
        """
        self.ddof_ = ddof
        self.mean_ = None
        self.std_ = None

    def fit(self, x):
        """Learn the mean and standard deviation of ``x`` along axis 0.

        :param x: array of training data.
        :return: ``self``, so that calls can be chained as
            ``Standardizer().fit(x).transform(y)`` like scikit-learn
            estimators.
        """
        self.mean_ = simple_mean(x, axis=0)
        self.std_ = simple_std(x, axis=0, ddof=self.ddof_)
        return self

    def transform(self, x):
        """Standardize ``x`` with the statistics stored by ``fit``.

        :param x: array to scale; must broadcast against ``mean_``/``std_``.
        :return: ``(x - mean_) / std_``.
        """
        return (x - self.mean_) / self.std_
class GroupedStandardizer(object):
    """Per-group standardizer with a scikit-learn style fit/transform API.

    Both ``fit`` and ``transform`` expect a 2-D array whose first column
    holds the group label of each row (it is cast to ``int``) and whose
    remaining columns hold the values to standardize.
    """

    def __init__(self, ddof=1):
        """Create an unfitted grouped standardizer.

        :param ddof: value forwarded to ``aggregate(..., 'std', ddof)``
            during ``fit``.
        """
        self.labels_ = None
        self.mean_ = None
        self.std_ = None
        self.ddof_ = ddof

    def fit(self, x):
        """Learn one mean/std row per group label found in ``x[:, 0]``.

        :param x: 2-D array, column 0 = group label, columns 1.. = values.
        :return: ``self``, so that calls can be chained as
            ``GroupedStandardizer().fit(x).transform(x)`` like scikit-learn
            estimators.
        """
        raw_groups = x[:, 0].astype(int)
        groups = group_mapping(raw_groups)
        values = x[:, 1:]
        self.mean_ = aggregate(groups, values, 'mean')
        self.std_ = aggregate(groups, values, 'std', self.ddof_)
        # NOTE(review): transform() looks rows up by position in labels_,
        # which assumes group_mapping numbers groups in the same (sorted)
        # order as np.unique -- confirm against alphamind.utilities.
        self.labels_ = np.unique(raw_groups)
        return self

    def transform(self, x):
        """Standardize the value columns of ``x`` with each row's group stats.

        :param x: 2-D array laid out like the one passed to ``fit``.
        :return: array of the value columns only (the label column is
            dropped), scaled as ``(value - group_mean) / group_std``.
        """
        groups = x[:, 0].astype(int)
        # Position of every row's label inside the labels seen during fit.
        index = array_index(self.labels_, groups)
        return (x[:, 1:] - self.mean_[index]) / self.std_[index]
if __name__ == '__main__':
    # Ad-hoc benchmark: time the functional grouped standardize() against
    # GroupedStandardizer.fit + transform on the same random data, then
    # check that both produce the same result.
    import datetime as dt

    n_loops = 10000

    values = np.random.randn(1000, 3)
    labels = np.random.randint(20, size=1000)
    # GroupedStandardizer expects the labels as the first column.
    labelled = np.concatenate([labels.reshape((-1, 1)), values], axis=1)

    tic = dt.datetime.now()
    for _ in range(n_loops):
        by_function = standardize(values, labels)
    print(dt.datetime.now() - tic)

    scaler = GroupedStandardizer(1)

    tic = dt.datetime.now()
    for _ in range(n_loops):
        scaler.fit(labelled)
        by_class = scaler.transform(labelled)
    print(dt.datetime.now() - tic)

    np.testing.assert_array_almost_equal(by_function, by_class)
\ No newline at end of file
...@@ -10,6 +10,8 @@ import numpy as np ...@@ -10,6 +10,8 @@ import numpy as np
import pandas as pd import pandas as pd
from scipy.stats import zscore from scipy.stats import zscore
from alphamind.data.standardize import standardize from alphamind.data.standardize import standardize
from alphamind.data.standardize import Standardizer
from alphamind.data.standardize import GroupedStandardizer
class TestStandardize(unittest.TestCase): class TestStandardize(unittest.TestCase):
...@@ -23,7 +25,7 @@ class TestStandardize(unittest.TestCase): ...@@ -23,7 +25,7 @@ class TestStandardize(unittest.TestCase):
exp_zscore = zscore(self.x, ddof=1) exp_zscore = zscore(self.x, ddof=1)
np.testing.assert_array_almost_equal(calc_zscore, exp_zscore) np.testing.assert_array_almost_equal(calc_zscore, exp_zscore)
def test_standardize_with_group(self): def test_standardize_with_group(self):
calc_zscore = standardize(self.x, self.groups) calc_zscore = standardize(self.x, self.groups)
exp_zscore = pd.DataFrame(self.x).\ exp_zscore = pd.DataFrame(self.x).\
...@@ -31,6 +33,45 @@ class TestStandardize(unittest.TestCase): ...@@ -31,6 +33,45 @@ class TestStandardize(unittest.TestCase):
transform(lambda s: (s - s.mean(axis=0)) / s.std(axis=0, ddof=1)) transform(lambda s: (s - s.mean(axis=0)) / s.std(axis=0, ddof=1))
np.testing.assert_array_almost_equal(calc_zscore, exp_zscore) np.testing.assert_array_almost_equal(calc_zscore, exp_zscore)
def test_standardizer(self):
    """Standardizer fit + transform must match the ungrouped standardize()."""
    scaler = Standardizer()
    scaler.fit(self.x)

    actual = scaler.transform(self.x)
    expected = standardize(self.x)

    np.testing.assert_array_almost_equal(actual, expected)
def test_groupedstandardizer(self):
    """GroupedStandardizer must match standardize() called with groups."""
    # The class API takes the group labels as the first column of the input.
    label_column = self.groups.reshape((-1, 1))
    labelled = np.concatenate([label_column, self.x], axis=1)

    scaler = GroupedStandardizer()
    scaler.fit(labelled)

    actual = scaler.transform(labelled)
    expected = standardize(self.x, self.groups)

    np.testing.assert_array_almost_equal(actual, expected)
if __name__ == '__main__':
    # Ad-hoc benchmark: compare Standardizer(ddof=0) against scikit-learn's
    # StandardScaler (fit on x, transform y) and check both agree.
    import datetime as dt
    from sklearn.preprocessing import StandardScaler

    x = np.random.randn(1000, 2)
    y = np.random.randn(50, 2)

    start = dt.datetime.now()
    for i in range(10000):
        s1 = StandardScaler()
        s1.fit(x)
        x1 = s1.transform(y)
    print(dt.datetime.now() - start)

    start = dt.datetime.now()
    for i in range(10000):
        s2 = Standardizer(ddof=0)
        s2.fit(x)
        x2 = s2.transform(y)
    print(dt.datetime.now() - start)

    np.testing.assert_array_almost_equal(x1, x2)

    # Running this module directly must still execute the test suite; the
    # benchmark above had replaced this call. Kept last because
    # unittest.main() exits the interpreter when it finishes.
    unittest.main()
\ No newline at end of file
...@@ -219,6 +219,21 @@ def scale_value(groups, source, x, scale): ...@@ -219,6 +219,21 @@ def scale_value(groups, source, x, scale):
return destination return destination
@nb.njit(nogil=True, cache=True)
def array_index(array, items):
    """Return, for every element of ``items``, its position in ``array``.

    Each item is located with a linear scan of ``array`` and the position of
    the first equal element is recorded.

    :param array: 1-D array of values to search in.
    :param items: 1-D array of values to look up.
    :return: ``np.int64`` array with one position per element of ``items``.

    NOTE: an item that does not occur in ``array`` keeps the initial value 0,
    which is indistinguishable from a real match at position 0. Callers must
    make sure every item is present in ``array``.
    """
    to_look_length = items.shape[0]
    arr_length = array.shape[0]

    # Positions are indices, so the buffer is always integer. Using
    # ``array.dtype`` here produced float "indices" for float label arrays,
    # which cannot be used to index another array.
    res = np.zeros(to_look_length, dtype=np.int64)

    for i in range(to_look_length):
        for j in range(arr_length):
            if items[i] == array[j]:
                res[i] = j
                break

    return res
def transform(groups: np.ndarray, def transform(groups: np.ndarray,
x: np.ndarray, x: np.ndarray,
func: str, func: str,
...@@ -255,3 +270,15 @@ def aggregate(groups, x, func, ddof=1): ...@@ -255,3 +270,15 @@ def aggregate(groups, x, func, ddof=1):
raise ValueError('({0}) is not recognized as valid functor'.format(func)) raise ValueError('({0}) is not recognized as valid functor'.format(func))
return value_data return value_data
if __name__ == '__main__':
    # Quick manual check of array_index: look up random draws in the sorted
    # unique values of another random draw and print the positions.
    source_draw = np.random.randint(30, size=1000)
    unique_values = np.unique(source_draw)

    lookups = np.random.randint(30, size=1000)
    positions = array_index(unique_values, lookups)

    print(positions)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment