added scikit-learn compatible standardize

6eb178dc · Dr.李 · 25b755fe · 6eb178dc · 6eb178dc · 6eb178dc
Commit 6eb178dc authored Jul 03, 2017 by Dr.李
Showing with 139 additions and 3 deletions

.gitignore .gitignore +3 -0

standardize.py alphamind/data/standardize.py +66 -1

test_standardize.py alphamind/tests/data/test_standardize.py +43 -2

utilities.py alphamind/utilities.py +27 -0

No files found.
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,6 @@ Alpha_Mind.egg-info/*
 *.c
 *.cpp
 *.html
+*.nbc
+*.nbi
+/notebooks/.ipynb_checkpoints
\ No newline at end of file
--- a/alphamind/data/standardize.py
+++ b/alphamind/data/standardize.py
@@ -8,8 +8,13 @@ Created on 2017-4-25
 import numpy as np
 from alphamind.utilities import group_mapping
 from alphamind.utilities import transform
+from alphamind.utilities import aggregate
 from alphamind.utilities import simple_mean
 from alphamind.utilities import simple_std
+from alphamind.utilities import array_index
+
+from numba import jitclass
+from numba import int32, float64


 def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray:
@@ -21,4 +26,64 @@ def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray:

        return (x - mean_values) / std_values
    else:
-        return (x - simple_mean(x, axis=0)) / simple_std(x, axis=0)
+        return (x - simple_mean(x, axis=0)) / simple_std(x, axis=0, ddof=ddof)
+
+
+class Standardizer(object):
+    """Standardizer with separate fit and transform steps.
+
+    ``fit`` stores the axis-0 mean and standard deviation of its input;
+    ``transform`` applies ``(x - mean_) / std_`` with the stored values.
+    """
+
+    def __init__(self, ddof=1):
+        # ddof is forwarded unchanged to simple_std in fit().
+        self.ddof_ = ddof
+        # Both statistics stay None until fit() has been called.
+        self.mean_ = None
+        self.std_ = None
+
+    def fit(self, x):
+        """Compute and store the axis-0 mean and standard deviation of ``x``."""
+        # NOTE(review): nothing is returned here, whereas scikit-learn
+        # estimators return ``self`` from fit() -- confirm whether call
+        # chaining (``Standardizer().fit(x).transform(x)``) is expected.
+        self.mean_ = simple_mean(x, axis=0)
+        self.std_ = simple_std(x, axis=0, ddof=self.ddof_)
+
+    def transform(self, x):
+        """Return ``(x - mean_) / std_`` using the statistics stored by fit()."""
+        return (x - self.mean_) / self.std_
+
+
+class GroupedStandardizer(object):
+    """Per-group standardizer with separate fit and transform steps.
+
+    Both ``fit`` and ``transform`` read the group label from the first
+    column of ``x`` (cast to int) and treat the remaining columns as the
+    values to standardize.
+    """
+
+    def __init__(self, ddof=1):
+        # labels_, mean_ and std_ stay None until fit() has been called.
+        self.labels_ = None
+        self.mean_ = None
+        self.std_ = None
+        # ddof is forwarded unchanged to aggregate(..., 'std', ddof) in fit().
+        self.ddof_ = ddof
+
+    def fit(self, x):
+        """Store per-group mean/std of ``x[:, 1:]`` and the unique labels."""
+        # NOTE(review): nothing is returned here, whereas scikit-learn
+        # estimators return ``self`` from fit() -- confirm whether needed.
+        raw_groups = x[:, 0].astype(int)
+        groups = group_mapping(raw_groups)
+        self.mean_ = aggregate(groups, x[:, 1:], 'mean')
+        self.std_ = aggregate(groups, x[:, 1:], 'std', self.ddof_)
+        # np.unique returns the sorted distinct labels.
+        # NOTE(review): transform() indexes mean_/std_ by position in
+        # labels_, so this presumably relies on group_mapping/aggregate
+        # ordering rows by sorted label -- verify against utilities.py.
+        self.labels_ = np.unique(raw_groups)
+
+    def transform(self, x):
+        """Standardize ``x[:, 1:]`` with the statistics of each row's group."""
+        groups = x[:, 0].astype(int)
+        # Position of each row's label inside labels_.
+        index = array_index(self.labels_, groups)
+        return (x[:, 1:] - self.mean_[index]) / self.std_[index]
+
+
+# Ad-hoc timing script: compares standardize() with GroupedStandardizer
+# on the same random data and checks that the results agree.
+if __name__ == '__main__':
+
+    import datetime as dt
+
+    # 1000 rows, 3 value columns, integer group labels drawn from [0, 20).
+    x_value = np.random.randn(1000, 3)
+    groups = np.random.randint(20, size=1000)
+    # GroupedStandardizer expects the group label as the first column.
+    x = np.concatenate([groups.reshape((-1, 1)), x_value], axis=1)
+
+    # Time 10000 calls of the function-style grouped standardize.
+    start = dt.datetime.now()
+    for i in range(10000):
+        x1 = standardize(x_value, groups)
+    print(dt.datetime.now() - start)
+
+    s = GroupedStandardizer(1)
+
+    # Time 10000 fit + transform rounds of the class-based version.
+    start = dt.datetime.now()
+    for i in range(10000):
+        s.fit(x)
+        x2 = s.transform(x)
+    print(dt.datetime.now() - start)
+
+    # Both code paths are expected to give the same result.
+    np.testing.assert_array_almost_equal(x1, x2)
\ No newline at end of file
--- a/alphamind/tests/data/test_standardize.py
+++ b/alphamind/tests/data/test_standardize.py
@@ -10,6 +10,8 @@ import numpy as np
 import pandas as pd
 from scipy.stats import zscore
 from alphamind.data.standardize import standardize
+from alphamind.data.standardize import Standardizer
+from alphamind.data.standardize import GroupedStandardizer


 class TestStandardize(unittest.TestCase):
@@ -23,7 +25,7 @@ class TestStandardize(unittest.TestCase):
        exp_zscore = zscore(self.x, ddof=1)

        np.testing.assert_array_almost_equal(calc_zscore, exp_zscore)
-        
+
    def test_standardize_with_group(self):
        calc_zscore = standardize(self.x, self.groups)
        exp_zscore = pd.DataFrame(self.x).\
@@ -31,6 +33,45 @@ class TestStandardize(unittest.TestCase):
            transform(lambda s: (s - s.mean(axis=0)) / s.std(axis=0, ddof=1))
        np.testing.assert_array_almost_equal(calc_zscore, exp_zscore)

+    def test_standardizer(self):
+        # Standardizer fitted and applied on the same data must agree
+        # with the function-style standardize() without groups.
+        s = Standardizer()
+        s.fit(self.x)
+        calc_zscore = s.transform(self.x)
+
+        exp_zscore = standardize(self.x)
+        np.testing.assert_array_almost_equal(calc_zscore, exp_zscore)
+
+    def test_groupedstandardizer(self):
+        # GroupedStandardizer fitted and applied on the same data must
+        # agree with the function-style standardize() with groups.
+
+        # The grouped estimator reads the group label from column 0.
+        x = np.concatenate([self.groups.reshape((-1, 1)), self.x], axis=1)
+
+        s = GroupedStandardizer()
+        s.fit(x)
+        calc_zscore = s.transform(x)
+
+        exp_zscore = standardize(self.x, self.groups)
+        np.testing.assert_array_almost_equal(calc_zscore, exp_zscore)
+

 if __name__ == '__main__':
-    unittest.main()
+    # NOTE(review): this change replaces unittest.main() with a timing
+    # script, so running this file directly no longer executes the tests
+    # -- confirm that is intended.
+    import datetime as dt
+    from sklearn.preprocessing import StandardScaler
+
+    # Fit on x (1000 x 2), transform a separate sample y (50 x 2).
+    x = np.random.randn(1000, 2)
+    y = np.random.randn(50, 2)
+
+    # Time 10000 construct + fit + transform rounds of StandardScaler.
+    start = dt.datetime.now()
+    for i in range(10000):
+        s1 = StandardScaler()
+        s1.fit(x)
+        x1 = s1.transform(y)
+    print(dt.datetime.now() - start)
+
+    # Same workload with Standardizer; ddof=0 is presumably chosen to
+    # match StandardScaler's variance definition -- verify.
+    start = dt.datetime.now()
+    for i in range(10000):
+        s2 = Standardizer(ddof=0)
+        s2.fit(x)
+        x2 = s2.transform(y)
+    print(dt.datetime.now() - start)
+
+    # Both scalers are expected to give the same result.
+    np.testing.assert_array_almost_equal(x1, x2)
\ No newline at end of file
--- a/alphamind/utilities.py
+++ b/alphamind/utilities.py
@@ -219,6 +219,21 @@ def scale_value(groups, source, x, scale):
    return destination


+@nb.njit(nogil=True, cache=True)
+def array_index(array, items):
+    """Return, for each entry of ``items``, the position of its first match in ``array``.
+
+    The result has one element per entry of ``items`` and is allocated
+    with ``array.dtype``. Each entry is located by a linear scan over
+    ``array`` that stops at the first equal element.
+    """
+    to_look_length = items.shape[0]
+    arr_length = array.shape[0]
+
+    # NOTE(review): the result holds positions but takes array's dtype;
+    # a non-integer array would yield non-integer indices -- confirm that
+    # callers only pass integer arrays.
+    res = np.zeros(to_look_length, dtype=array.dtype)
+
+    for i in range(to_look_length):
+        for j in range(arr_length):
+            if items[i] == array[j]:
+                res[i] = j
+                break
+    # NOTE(review): an item with no match keeps the initial 0, which is
+    # indistinguishable from a real match at position 0.
+    return res
+
+
 def transform(groups: np.ndarray,
              x: np.ndarray,
              func: str,
@@ -255,3 +270,15 @@ def aggregate(groups, x, func, ddof=1):
        raise ValueError('({0}) is not recognized as valid functor'.format(func))

    return value_data
+
+
+# Ad-hoc smoke run of array_index on random integer data.
+if __name__ == '__main__':
+
+    # Lookup table: sorted distinct values of a random sample from [0, 30).
+    x1 = np.random.randint(30, size=1000)
+    array = np.unique(x1)
+
+    # Items to locate; drawn independently of x1, so a value in x2 is
+    # not guaranteed to be present in array.
+    x2 = np.random.randint(30, size=1000)
+
+    res = array_index(array, x2)
+
+    print(res)
\ No newline at end of file