Commit 6eb178dc authored by Dr.李

added scikit-learn compatible standardize

parent 25b755fe
......@@ -7,3 +7,6 @@ Alpha_Mind.egg-info/*
*.c
*.cpp
*.html
*.nbc
*.nbi
/notebooks/.ipynb_checkpoints
\ No newline at end of file
......@@ -8,8 +8,13 @@ Created on 2017-4-25
import numpy as np
from alphamind.utilities import group_mapping
from alphamind.utilities import transform
from alphamind.utilities import aggregate
from alphamind.utilities import simple_mean
from alphamind.utilities import simple_std
from alphamind.utilities import array_index
from numba import jitclass
from numba import int32, float64
def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray:
......@@ -21,4 +26,64 @@ def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray:
return (x - mean_values) / std_values
else:
return (x - simple_mean(x, axis=0)) / simple_std(x, axis=0)
return (x - simple_mean(x, axis=0)) / simple_std(x, axis=0, ddof=ddof)
class Standardizer(object):
    """Z-score transformer exposing a scikit-learn style fit/transform API.

    ``fit`` records the statistics returned by ``simple_mean`` and
    ``simple_std`` along axis 0 of the training array; ``transform`` then
    applies ``(x - mean_) / std_`` to any array that broadcasts against them.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, ddof=1):
        # Forwarded unchanged to ``simple_std`` when ``fit`` is called.
        self.ddof_ = ddof
        # Both stay ``None`` until ``fit`` has been called.
        self.mean_ = None
        self.std_ = None

    def fit(self, x):
        """Learn the axis-0 mean and standard deviation of ``x``.

        Returns:
            self, so that calls can be chained the way scikit-learn
            estimators allow (``Standardizer().fit(x).transform(y)``).
        """
        self.mean_ = simple_mean(x, axis=0)
        self.std_ = simple_std(x, axis=0, ddof=self.ddof_)
        return self

    def transform(self, x):
        """Return ``(x - mean_) / std_`` using the statistics stored by ``fit``.

        No guard is applied for zero standard deviations; the division is
        performed as-is.
        """
        return (x - self.mean_) / self.std_
class GroupedStandardizer(object):
    """Per-group z-score transformer with a scikit-learn style API.

    The input array carries the group label in column 0 (cast to ``int``)
    and the values to standardize in the remaining columns.
    """

    def __init__(self, ddof=1):
        # Unique group labels seen by ``fit`` (output of ``np.unique``).
        self.labels_ = None
        # Per-group statistics produced by ``aggregate``; ``None`` until fit.
        self.mean_ = None
        self.std_ = None
        # Forwarded unchanged to ``aggregate(..., 'std', ddof)``.
        self.ddof_ = ddof

    def fit(self, x):
        """Compute per-group mean and standard deviation of ``x[:, 1:]``.

        Returns:
            self, so that ``GroupedStandardizer().fit(x).transform(x)``
            works as it does for scikit-learn estimators.
        """
        raw_groups = x[:, 0].astype(int)
        groups = group_mapping(raw_groups)
        self.mean_ = aggregate(groups, x[:, 1:], 'mean')
        self.std_ = aggregate(groups, x[:, 1:], 'std', self.ddof_)
        # NOTE(review): ``transform`` indexes mean_/std_ by position within
        # ``np.unique(raw_groups)``; this assumes ``group_mapping`` numbers the
        # groups in that same (sorted) order -- confirm against its definition.
        self.labels_ = np.unique(raw_groups)
        return self

    def transform(self, x):
        """Standardize ``x[:, 1:]`` with the statistics of each row's group.

        Column 0 of ``x`` is read as the group label and is not part of the
        returned array.
        """
        groups = x[:, 0].astype(int)
        # NOTE(review): how ``array_index`` treats labels that were not seen
        # during ``fit`` should be checked; no validation is done here.
        index = array_index(self.labels_, groups)
        return (x[:, 1:] - self.mean_[index]) / self.std_[index]
if __name__ == '__main__':
    # Ad-hoc timing comparison between the functional ``standardize`` and the
    # class-based ``GroupedStandardizer``; both must yield the same z-scores.
    import datetime as dt

    x_value = np.random.randn(1000, 3)
    groups = np.random.randint(20, size=1000)
    # Group labels go in column 0, as GroupedStandardizer expects.
    x = np.concatenate([groups.reshape((-1, 1)), x_value], axis=1)

    n_rounds = 10000

    # Functional implementation.
    start = dt.datetime.now()
    for _ in range(n_rounds):
        x1 = standardize(x_value, groups)
    elapsed = dt.datetime.now() - start
    print(elapsed)

    # Class-based implementation (fit + transform on every round).
    s = GroupedStandardizer(1)
    start = dt.datetime.now()
    for _ in range(n_rounds):
        s.fit(x)
        x2 = s.transform(x)
    elapsed = dt.datetime.now() - start
    print(elapsed)

    np.testing.assert_array_almost_equal(x1, x2)
\ No newline at end of file
......@@ -10,6 +10,8 @@ import numpy as np
import pandas as pd
from scipy.stats import zscore
from alphamind.data.standardize import standardize
from alphamind.data.standardize import Standardizer
from alphamind.data.standardize import GroupedStandardizer
class TestStandardize(unittest.TestCase):
......@@ -31,6 +33,45 @@ class TestStandardize(unittest.TestCase):
transform(lambda s: (s - s.mean(axis=0)) / s.std(axis=0, ddof=1))
np.testing.assert_array_almost_equal(calc_zscore, exp_zscore)
def test_standardizer(self):
    """Standardizer fit/transform must match the functional standardize."""
    scaler = Standardizer()
    scaler.fit(self.x)

    actual = scaler.transform(self.x)
    expected = standardize(self.x)

    np.testing.assert_array_almost_equal(actual, expected)
def test_groupedstandardizer(self):
    """GroupedStandardizer must match standardize called with groups."""
    # The grouped transformer reads the group label from column 0.
    label_column = self.groups.reshape((-1, 1))
    stacked = np.concatenate([label_column, self.x], axis=1)

    scaler = GroupedStandardizer()
    scaler.fit(stacked)

    actual = scaler.transform(stacked)
    expected = standardize(self.x, self.groups)

    np.testing.assert_array_almost_equal(actual, expected)
if __name__ == '__main__':
    import sys
    import datetime as dt

    # unittest.main() calls sys.exit() by default, which would make the
    # comparison below unreachable. Run the tests without exiting, keep the
    # result, and propagate the test status at the very end instead.
    test_program = unittest.main(exit=False)

    from sklearn.preprocessing import StandardScaler

    x = np.random.randn(1000, 2)
    y = np.random.randn(50, 2)

    # Reference: scikit-learn's StandardScaler fitted on x, applied to y.
    start = dt.datetime.now()
    for i in range(10000):
        s1 = StandardScaler()
        s1.fit(x)
        x1 = s1.transform(y)
    print(dt.datetime.now() - start)

    # Standardizer with ddof=0, expected to agree with StandardScaler
    # (checked by the assertion below).
    start = dt.datetime.now()
    for i in range(10000):
        s2 = Standardizer(ddof=0)
        s2.fit(x)
        x2 = s2.transform(y)
    print(dt.datetime.now() - start)

    np.testing.assert_array_almost_equal(x1, x2)

    # Same exit status unittest.main() would have produced on its own.
    sys.exit(not test_program.result.wasSuccessful())
\ No newline at end of file
......@@ -219,6 +219,21 @@ def scale_value(groups, source, x, scale):
return destination
@nb.njit(nogil=True, cache=True)
def array_index(array, items):
    """Return, for each entry of ``items``, its first position in ``array``.

    Args:
        array: 1-d array of candidate values to search in.
        items: 1-d array of values to look up.

    Returns:
        An ``int64`` array of length ``items.shape[0]`` where element ``i`` is
        the smallest ``j`` with ``items[i] == array[j]``.

    Note:
        An item that does not occur in ``array`` keeps the initial value 0,
        so it is indistinguishable from a match at position 0. Callers that
        may pass unseen values must validate them beforehand.
    """
    to_look_length = items.shape[0]
    arr_length = array.shape[0]

    # The result holds positions, not values: always use an integer dtype
    # rather than inheriting ``array.dtype`` (which may be float or a narrow
    # integer type unable to represent every index).
    res = np.zeros(to_look_length, dtype=np.int64)

    # Plain linear scan per item; stops at the first match.
    for i in range(to_look_length):
        for j in range(arr_length):
            if items[i] == array[j]:
                res[i] = j
                break
    return res
def transform(groups: np.ndarray,
x: np.ndarray,
func: str,
......@@ -255,3 +270,15 @@ def aggregate(groups, x, func, ddof=1):
raise ValueError('({0}) is not recognized as valid functor'.format(func))
return value_data
if __name__ == '__main__':
    # Small manual demo: look up random integers in the unique values of
    # another random sample and print the resulting positions.
    x1 = np.random.randint(30, size=1000)
    array = np.unique(x1)

    x2 = np.random.randint(30, size=1000)
    res = array_index(array, x2)

    print(res)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment