Commit 1a6908af authored by Dr.李

update data utilities

parent 55b727e7
...@@ -87,8 +87,8 @@ def _sub_step(x, y, w, curr_idx, res): ...@@ -87,8 +87,8 @@ def _sub_step(x, y, w, curr_idx, res):
@nb.njit(nogil=True, cache=True) @nb.njit(nogil=True, cache=True)
def ls_fit(x: np.ndarray, y: np.ndarray, w: np.ndarray) -> np.ndarray: def ls_fit(x: np.ndarray, y: np.ndarray, w: np.ndarray) -> np.ndarray:
x_bar = x.T x_bar = x.T * w
b = np.linalg.solve(x_bar * w @ x, x_bar * w @ y) b = np.linalg.solve(x_bar @ x, x_bar @ y)
return b return b
......
...@@ -9,12 +9,9 @@ import numpy as np ...@@ -9,12 +9,9 @@ import numpy as np
from alphamind.utilities import group_mapping from alphamind.utilities import group_mapping
from alphamind.utilities import transform from alphamind.utilities import transform
from alphamind.utilities import aggregate from alphamind.utilities import aggregate
from alphamind.utilities import array_index
from alphamind.utilities import simple_mean from alphamind.utilities import simple_mean
from alphamind.utilities import simple_std from alphamind.utilities import simple_std
from alphamind.utilities import array_index
from numba import jitclass
from numba import int32, float64
def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray: def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray:
...@@ -31,59 +28,35 @@ def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray: ...@@ -31,59 +28,35 @@ def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray:
class Standardizer(object): class Standardizer(object):
def __init__(self, ddof=1): def __init__(self, ddof: int=1):
self.ddof_ = ddof self.ddof_ = ddof
self.mean_ = None self.mean_ = None
self.std_ = None self.std_ = None
def fit(self, x): def fit(self, x: np.ndarray):
self.mean_ = simple_mean(x, axis=0) self.mean_ = simple_mean(x, axis=0)
self.std_ = simple_std(x, axis=0, ddof=self.ddof_) self.std_ = simple_std(x, axis=0, ddof=self.ddof_)
def transform(self, x): def transform(self, x: np.ndarray) -> np.ndarray:
return (x - self.mean_) / self.std_ return (x - self.mean_) / self.std_
class GroupedStandardizer(object): class GroupedStandardizer(object):
def __init__(self, ddof=1): def __init__(self, ddof: int=1):
self.labels_ = None self.labels_ = None
self.mean_ = None self.mean_ = None
self.std_ = None self.std_ = None
self.ddof_ = ddof self.ddof_ = ddof
def fit(self, x): def fit(self, x: np.ndarray):
raw_groups = x[:, 0].astype(int) raw_groups = x[:, 0].astype(int)
groups = group_mapping(raw_groups) groups = group_mapping(raw_groups)
self.mean_ = aggregate(groups, x[:, 1:], 'mean') self.mean_ = aggregate(groups, x[:, 1:], 'mean')
self.std_ = aggregate(groups, x[:, 1:], 'std', self.ddof_) self.std_ = aggregate(groups, x[:, 1:], 'std', self.ddof_)
self.labels_ = np.unique(raw_groups) self.labels_ = np.unique(raw_groups)
def transform(self, x): def transform(self, x: np.ndarray) -> np.ndarray:
groups = x[:, 0].astype(int) groups = x[:, 0].astype(int)
index = array_index(self.labels_, groups) index = array_index(self.labels_, groups)
return (x[:, 1:] - self.mean_[index]) / self.std_[index] return (x[:, 1:] - self.mean_[index]) / self.std_[index]
if __name__ == '__main__':
import datetime as dt
x_value = np.random.randn(1000, 3)
groups = np.random.randint(20, size=1000)
x = np.concatenate([groups.reshape((-1, 1)), x_value], axis=1)
start = dt.datetime.now()
for i in range(10000):
x1 = standardize(x_value, groups)
print(dt.datetime.now() - start)
s = GroupedStandardizer(1)
start = dt.datetime.now()
for i in range(10000):
s.fit(x)
x2 = s.transform(x)
print(dt.datetime.now() - start)
np.testing.assert_array_almost_equal(x1, x2)
\ No newline at end of file
...@@ -52,14 +52,14 @@ def mask_values_1d(x: np.ndarray, ...@@ -52,14 +52,14 @@ def mask_values_1d(x: np.ndarray,
return res return res
def winsorize_normal(x: np.ndarray, num_stds: int = 3, groups: np.ndarray = None) -> np.ndarray: def winsorize_normal(x: np.ndarray, num_stds: int = 3, ddof=1, groups: np.ndarray = None) -> np.ndarray:
if groups is not None: if groups is not None:
groups = group_mapping(groups) groups = group_mapping(groups)
mean_values = transform(groups, x, 'mean') mean_values = transform(groups, x, 'mean')
std_values = transform(groups, x, 'std') std_values = transform(groups, x, 'std')
res = mask_values_2d(x, mean_values, std_values, num_stds) res = mask_values_2d(x, mean_values, std_values, num_stds)
else: else:
std_values = simple_std(x, axis=0) std_values = simple_std(x, axis=0, ddof=ddof)
mean_values = simple_mean(x, axis=0) mean_values = simple_mean(x, axis=0)
res = mask_values_1d(x, mean_values, std_values, num_stds) res = mask_values_1d(x, mean_values, std_values, num_stds)
return res return res
...@@ -36,7 +36,7 @@ class TestWinsorize(unittest.TestCase): ...@@ -36,7 +36,7 @@ class TestWinsorize(unittest.TestCase):
np.testing.assert_array_almost_equal(col_data, calculated_col) np.testing.assert_array_almost_equal(col_data, calculated_col)
def test_winsorize_normal_with_group(self): def test_winsorize_normal_with_group(self):
cal_winsorized = winsorize_normal(self.x, self.num_stds, self.groups) cal_winsorized = winsorize_normal(self.x, self.num_stds, groups=self.groups)
def impl(x): def impl(x):
std_values = x.std(axis=0, ddof=1) std_values = x.std(axis=0, ddof=1)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment