Commit bc595833 authored by Dr.李

modified to get better performance

parent 8ad5bdb1
...@@ -15,8 +15,15 @@ def standardize(x: np.ndarray, groups: np.ndarray=None) -> np.ndarray: ...@@ -15,8 +15,15 @@ def standardize(x: np.ndarray, groups: np.ndarray=None) -> np.ndarray:
df = pd.DataFrame(x) df = pd.DataFrame(x)
gs = df.groupby(groups) gs = df.groupby(groups)
mean_values = gs.transform(np.mean).values mean_values = gs.mean()
std_values = gs.transform(np.std).values std_values = gs.std().values
value_index = np.searchsorted(mean_values.index, groups)
mean_values = mean_values.values
mean_values = mean_values[value_index]
std_values = std_values[value_index]
return (x - mean_values) / std_values return (x - mean_values) / std_values
else: else:
return (x - x.mean(axis=0)) / x.std(axis=0) return (x - x.mean(axis=0)) / x.std(axis=0)
......
...@@ -8,22 +8,38 @@ Created on 2017-4-25 ...@@ -8,22 +8,38 @@ Created on 2017-4-25
import pandas as pd import pandas as pd
import numpy as np import numpy as np
def winsorize_normal(x: np.ndarray, num_stds: int=3, groups: np.ndarray=None) -> np.ndarray:
    """Clip values lying more than ``num_stds`` standard deviations from the mean.

    Each column of ``x`` is treated independently. Values above
    ``mean + num_stds * std`` are replaced by that upper bound and values
    below ``mean - num_stds * std`` by the lower bound; everything else is
    returned unchanged.

    Parameters
    ----------
    x : np.ndarray
        2-D array; statistics are taken down the rows (axis 0).
    num_stds : int
        Half-width of the accepted band, in standard deviations.
    groups : np.ndarray, optional
        One label per row of ``x``. When given, mean and standard deviation
        are computed separately for each label and every row is clipped
        against the bounds of its own group.

    Returns
    -------
    np.ndarray
        Array of the same shape as ``x`` with out-of-band values clipped.

    Notes
    -----
    The grouped branch uses the pandas group-by ``std`` (sample standard
    deviation, ``ddof=1`` by default) while the ungrouped branch uses
    ``ndarray.std`` (population standard deviation, ``ddof=0``).
    """
    if groups is not None:
        df = pd.DataFrame(x)
        gs = df.groupby(groups)

        # One row of statistics per distinct label -- far cheaper than
        # materialising per-row statistics with ``transform``.
        mean_values = gs.mean()
        std_values = gs.std().values

        # ``groupby`` sorts its keys by default, so the index of the
        # aggregated frame is sorted and ``searchsorted`` yields, for every
        # input row, the position of that row's group in the tables above.
        value_index = np.searchsorted(mean_values.index, groups)
        mean_values = mean_values.values

        # Build the bounds on the small per-group tables first, then expand
        # them to one bound per input row.
        ubound = (mean_values + num_stds * std_values)[value_index]
        lbound = (mean_values - num_stds * std_values)[value_index]
    else:
        std_values = x.std(axis=0)
        mean_values = x.mean(axis=0)
        ubound = mean_values + num_stds * std_values
        lbound = mean_values - num_stds * std_values

    # Single pass: take the upper bound where exceeded, otherwise the lower
    # bound where undershot, otherwise the original value.
    res = np.where(x > ubound, ubound, np.where(x < lbound, lbound, x))
    return res
if __name__ == '__main__':
    # Ad-hoc timing driver: repeatedly winsorize a random matrix whose rows
    # are assigned to random integer groups drawn from [20, 40).
    n_rows, n_cols = 3000, 10
    n_repeats = 1000

    x = np.random.randn(n_rows, n_cols)
    groups = np.random.randint(20, 40, size=n_rows)

    for _ in range(n_repeats):
        winsorize_normal(x, 2, groups)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment