Commit 1a6908af authored by Dr.李

update data utilities

parent 55b727e7
......@@ -87,8 +87,8 @@ def _sub_step(x, y, w, curr_idx, res):
@nb.njit(nogil=True, cache=True)
def ls_fit(x: np.ndarray, y: np.ndarray, w: np.ndarray) -> np.ndarray:
    """Solve the weighted least-squares normal equations.

    Returns ``b`` satisfying ``((x.T * w) @ x) @ b == (x.T * w) @ y``.

    Parameters
    ----------
    x : np.ndarray
        Design matrix.
        # assumes shape (n_obs, n_features) - TODO confirm against callers
    y : np.ndarray
        Target values aligned with the rows of ``x``.
    w : np.ndarray
        Weights broadcast against ``x.T``.
        # presumably one weight per observation, shape (n_obs,) - TODO confirm

    Returns
    -------
    np.ndarray
        Solution of the linear system as returned by ``np.linalg.solve``.
    """
    # Apply the weights to the transposed design matrix once and reuse the
    # product for both sides of the system instead of recomputing it.
    x_bar = x.T * w
    b = np.linalg.solve(x_bar @ x, x_bar @ y)
    return b
......
......@@ -9,12 +9,9 @@ import numpy as np
from alphamind.utilities import group_mapping
from alphamind.utilities import transform
from alphamind.utilities import aggregate
from alphamind.utilities import array_index
from alphamind.utilities import simple_mean
from alphamind.utilities import simple_std
from alphamind.utilities import array_index
from numba import jitclass
from numba import int32, float64
def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray:
......@@ -31,59 +28,35 @@ def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray:
class Standardizer(object):
    """Standardize data using statistics learned by :meth:`fit`.

    ``fit`` stores the results of ``simple_mean`` / ``simple_std`` taken
    along ``axis=0``; ``transform`` subtracts the stored mean and divides by
    the stored standard deviation.
    """

    def __init__(self, ddof: int=1):
        """Remember ``ddof`` and start with no fitted statistics."""
        self.ddof_ = ddof
        # Populated by fit(); None until then.
        self.mean_ = None
        self.std_ = None

    def fit(self, x: np.ndarray):
        """Compute and store the axis-0 mean and standard deviation of ``x``.

        ``ddof`` given at construction is forwarded to ``simple_std``.
        """
        self.mean_ = simple_mean(x, axis=0)
        self.std_ = simple_std(x, axis=0, ddof=self.ddof_)

    def transform(self, x: np.ndarray) -> np.ndarray:
        """Return ``(x - mean_) / std_`` using the statistics from ``fit``."""
        return (x - self.mean_) / self.std_
class GroupedStandardizer(object):
    """Standardize data per group using statistics learned by :meth:`fit`.

    Both ``fit`` and ``transform`` take a 2-D array whose first column holds
    the group label (cast to ``int``) and whose remaining columns hold the
    values to standardize.
    """

    def __init__(self, ddof: int=1):
        """Remember ``ddof`` and start with no fitted statistics."""
        # Populated by fit(); None until then.
        self.labels_ = None
        self.mean_ = None
        self.std_ = None
        self.ddof_ = ddof

    def fit(self, x: np.ndarray):
        """Compute and store per-group mean and standard deviation.

        The group labels are read from ``x[:, 0]`` and the statistics are
        aggregated over ``x[:, 1:]``.
        """
        raw_groups = x[:, 0].astype(int)
        groups = group_mapping(raw_groups)
        self.mean_ = aggregate(groups, x[:, 1:], 'mean')
        self.std_ = aggregate(groups, x[:, 1:], 'std', self.ddof_)
        # Sorted unique labels, used by transform() to locate each row's group.
        # NOTE(review): presumably the row order of mean_/std_ produced by
        # aggregate() matches this sorted label order - confirm.
        self.labels_ = np.unique(raw_groups)

    def transform(self, x: np.ndarray) -> np.ndarray:
        """Return the value columns of ``x`` standardized by their group.

        Each row's label (``x[:, 0]``) is looked up in ``labels_`` via
        ``array_index`` and the matching fitted mean / std are applied to
        ``x[:, 1:]``. The label column is not part of the result.
        """
        groups = x[:, 0].astype(int)
        index = array_index(self.labels_, groups)
        return (x[:, 1:] - self.mean_[index]) / self.std_[index]
if __name__ == '__main__':
    # Ad-hoc benchmark and consistency check: time the functional
    # `standardize` against `GroupedStandardizer` on the same random data and
    # verify that both produce the same result.
    import datetime as dt

    x_value = np.random.randn(1000, 3)
    groups = np.random.randint(20, size=1000)
    # GroupedStandardizer expects the group label in the first column.
    x = np.concatenate([groups.reshape((-1, 1)), x_value], axis=1)

    start = dt.datetime.now()
    for _ in range(10000):
        x1 = standardize(x_value, groups)
    print(dt.datetime.now() - start)

    s = GroupedStandardizer(1)
    start = dt.datetime.now()
    for _ in range(10000):
        s.fit(x)
        x2 = s.transform(x)
    print(dt.datetime.now() - start)

    np.testing.assert_array_almost_equal(x1, x2)
\ No newline at end of file
......@@ -52,14 +52,14 @@ def mask_values_1d(x: np.ndarray,
return res
def winsorize_normal(x: np.ndarray, num_stds: int = 3, ddof: int = 1, groups: np.ndarray = None) -> np.ndarray:
    """Winsorize ``x`` around its mean using a multiple of its standard deviation.

    Mean and standard deviation are computed either over the whole sample
    (``groups is None``) or per group, and then handed together with
    ``num_stds`` to ``mask_values_1d`` / ``mask_values_2d``, which produce
    the result.

    Parameters
    ----------
    x : np.ndarray
        Data to winsorize.
    num_stds : int
        Number of standard deviations passed to the masking helper.
    ddof : int
        Delta degrees of freedom forwarded to ``simple_std`` in the
        ungrouped branch.
    groups : np.ndarray, optional
        Group labels; when given, statistics are computed per group.
        ``groups`` follows ``ddof`` in the signature, so callers supplying
        it should use the keyword form.

    Returns
    -------
    np.ndarray
        The array returned by the masking helper.
    """
    if groups is not None:
        groups = group_mapping(groups)
        mean_values = transform(groups, x, 'mean')
        # NOTE(review): `ddof` is not forwarded in this branch, so the grouped
        # path uses whatever default `transform` applies - confirm whether
        # `transform` accepts a ddof argument and pass it through if so.
        std_values = transform(groups, x, 'std')
        res = mask_values_2d(x, mean_values, std_values, num_stds)
    else:
        std_values = simple_std(x, axis=0, ddof=ddof)
        mean_values = simple_mean(x, axis=0)
        res = mask_values_1d(x, mean_values, std_values, num_stds)
    return res
......@@ -36,7 +36,7 @@ class TestWinsorize(unittest.TestCase):
np.testing.assert_array_almost_equal(col_data, calculated_col)
def test_winsorize_normal_with_group(self):
cal_winsorized = winsorize_normal(self.x, self.num_stds, self.groups)
cal_winsorized = winsorize_normal(self.x, self.num_stds, groups=self.groups)
def impl(x):
std_values = x.std(axis=0, ddof=1)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment