Commit 8ad5bdb1 authored by Dr.李

modified winsorized benchmark and tests

parent 3dbd0208
# -*- coding: utf-8 -*-
"""
Created on 2017-4-25
@author: cheng.li
"""
from alphamind.benchmarks.neutralize import benchmark_neutralize
from alphamind.benchmarks.standardize import benchmark_standardize
from alphamind.benchmarks.standardize import benchmark_standardize_with_group
from alphamind.benchmarks.winsorize import benchmark_winsorize_normal
from alphamind.benchmarks.winsorize import benchmark_winsorize_normal_with_group
if __name__ == '__main__':
    # Table of (benchmark function, positional arguments); executed top to
    # bottom so the console output keeps the same order as before.
    benchmark_plan = (
        (benchmark_neutralize, (3000, 10, 1000)),
        (benchmark_standardize, (3000, 10, 1000)),
        (benchmark_standardize_with_group, (3000, 10, 100, 30)),
        (benchmark_winsorize_normal, (3000, 10, 1000)),
        (benchmark_winsorize_normal_with_group, (3000, 10, 100, 30)),
    )
    for run_benchmark, arguments in benchmark_plan:
        run_benchmark(*arguments)
# -*- coding: utf-8 -*-
"""
Created on 2017-4-25
@author: cheng.li
"""
import datetime as dt
import time

import numpy as np
import pandas as pd

from alphamind.data.winsorize import winsorize_normal
def benchmark_winsorize_normal(n_samples: int, n_features: int, n_loops: int) -> None:
    """Time ``winsorize_normal`` against a plain NumPy reference implementation.

    A random ``(n_samples, n_features)`` matrix is generated once and both
    implementations are run ``n_loops`` times on it with ``num_stds = 2``.
    The elapsed time of each run is printed to stdout.

    :param n_samples: number of rows of the random input matrix
    :param n_features: number of columns of the random input matrix
    :param n_loops: how many times each implementation is invoked
    """
    print("-" * 60)
    print("Starting winsorize normal benchmarking")
    print("Parameters(n_samples: {0}, n_features: {1}, n_loops: {2})".format(n_samples, n_features, n_loops))

    num_stds = 2
    x = np.random.randn(n_samples, n_features)

    # perf_counter() is monotonic and high resolution, unlike the wall clock,
    # so the measurement cannot be distorted by system clock adjustments.
    # The result is wrapped in a timedelta to keep the printed format.
    start = time.perf_counter()
    for _ in range(n_loops):
        winsorize_normal(x, num_stds)
    impl_model_time = dt.timedelta(seconds=time.perf_counter() - start)

    print('{0:20s}: {1}'.format('Implemented model', impl_model_time))

    def impl(x):
        # Reference implementation: clamp every column to
        # mean +/- num_stds * std, computed over all rows.
        std_values = x.std(axis=0)
        mean_value = x.mean(axis=0)

        lower_bound = mean_value - num_stds * std_values
        upper_bound = mean_value + num_stds * std_values

        res = np.where(x > upper_bound, upper_bound, x)
        res = np.where(res < lower_bound, lower_bound, res)
        return res

    start = time.perf_counter()
    for _ in range(n_loops):
        impl(x)
    benchmark_model_time = dt.timedelta(seconds=time.perf_counter() - start)

    print('{0:20s}: {1}'.format('Benchmark model', benchmark_model_time))
def benchmark_winsorize_normal_with_group(n_samples: int, n_features: int, n_loops: int, n_groups: int) -> None:
    """Time grouped ``winsorize_normal`` against a pandas group-by reference.

    A random ``(n_samples, n_features)`` matrix and a random integer group
    label per row (drawn from ``[0, n_groups)``) are generated once. Both
    implementations are then run ``n_loops`` times with ``num_stds = 2`` and
    the elapsed time of each run is printed to stdout.

    :param n_samples: number of rows of the random input matrix
    :param n_features: number of columns of the random input matrix
    :param n_loops: how many times each implementation is invoked
    :param n_groups: exclusive upper bound of the random group labels
    """
    print("-" * 60)
    print("Starting winsorize normal with group-by values benchmarking")
    print("Parameters(n_samples: {0}, n_features: {1}, n_loops: {2}, n_groups: {3})".format(n_samples, n_features, n_loops, n_groups))

    num_stds = 2
    x = np.random.randn(n_samples, n_features)
    groups = np.random.randint(n_groups, size=n_samples)

    # perf_counter() is monotonic and high resolution, unlike the wall clock,
    # so the measurement cannot be distorted by system clock adjustments.
    # The result is wrapped in a timedelta to keep the printed format.
    start = time.perf_counter()
    for _ in range(n_loops):
        winsorize_normal(x, num_stds, groups=groups)
    impl_model_time = dt.timedelta(seconds=time.perf_counter() - start)

    print('{0:20s}: {1}'.format('Implemented model', impl_model_time))

    def impl(x):
        # Reference implementation applied by pandas to each group's data:
        # clamp to mean +/- num_stds * std of that data.
        std_values = x.std(axis=0)
        mean_value = x.mean(axis=0)

        lower_bound = mean_value - num_stds * std_values
        upper_bound = mean_value + num_stds * std_values

        res = np.where(x > upper_bound, upper_bound, x)
        res = np.where(res < lower_bound, lower_bound, res)
        return res

    start = time.perf_counter()
    for _ in range(n_loops):
        pd.DataFrame(x).groupby(groups).transform(impl)
    benchmark_model_time = dt.timedelta(seconds=time.perf_counter() - start)

    print('{0:20s}: {1}'.format('Benchmark model', benchmark_model_time))
if __name__ == '__main__':
    # Run the plain benchmark first, then the grouped one, with the same
    # fixed problem sizes as before.
    for run_benchmark, arguments in (
        (benchmark_winsorize_normal, (3000, 10, 1000)),
        (benchmark_winsorize_normal_with_group, (3000, 10, 1000, 30)),
    ):
        run_benchmark(*arguments)
...@@ -5,10 +5,25 @@ Created on 2017-4-25 ...@@ -5,10 +5,25 @@ Created on 2017-4-25
@author: cheng.li @author: cheng.li
""" """
import pandas as pd
import numpy as np
def winsorize_normal(x: np.ndarray, num_stds: int = 3, groups: np.ndarray = None) -> np.ndarray:
    """Clamp values lying more than ``num_stds`` standard deviations from the mean.

    Statistics are taken along axis 0, i.e. separately for every column of
    ``x``. Values above ``mean + num_stds * std`` are replaced by that upper
    bound and values below ``mean - num_stds * std`` by that lower bound.

    :param x: array of values to winsorize
    :param num_stds: half-width of the accepted band, in standard deviations
    :param groups: optional labels, one per row of ``x``; when given, mean
        and standard deviation are computed within each group instead of
        over all rows
    :return: a new array holding the clamped values; ``x`` is not modified

    Note: the ungrouped path uses ``ndarray.std`` (NumPy default ``ddof=0``)
    while the grouped path uses the pandas group-by ``std`` (pandas default
    ``ddof=1``). A group whose standard deviation is NaN (e.g. a single-row
    group) has NaN bounds, so its values are left untouched.
    """
    if groups is not None:
        gs = pd.DataFrame(x).groupby(groups)
        # Use the string names rather than the np.mean / np.std callables:
        # pandas maps those callables onto the same group-by methods today,
        # but that aliasing is deprecated, and without it np.std would no
        # longer yield the group-by ``std`` result.
        mean_values = gs.transform('mean').values
        std_values = gs.transform('std').values
    else:
        std_values = x.std(axis=0)
        mean_values = x.mean(axis=0)

    ubound = mean_values + num_stds * std_values
    lbound = mean_values - num_stds * std_values

    # np.where (not np.clip) so comparisons against NaN bounds evaluate to
    # False and keep the original value instead of propagating NaN.
    res = np.where(x > ubound, ubound, x)
    res = np.where(res < lbound, lbound, res)
    return res
# -*- coding: utf-8 -*-
"""
Created on 2017-4-25
@author: cheng.li
"""
import unittest
import numpy as np
import pandas as pd
from alphamind.data.winsorize import winsorize_normal
class TestWinsorize(unittest.TestCase):
    """Checks ``winsorize_normal`` against independently computed expectations."""

    def test_winsorize_normal(self):
        """Without groups, each column is clamped to mean +/- num_stds * std."""
        num_stds = 2

        x = np.random.randn(3000, 10)
        # Keep a pristine copy: the expectation must not share memory with the
        # array handed to (and possibly returned by) the function under test,
        # otherwise an implementation that aliases its input would pass
        # vacuously.
        reference = x.copy()

        calc_winsorized = winsorize_normal(x, num_stds)

        std_values = reference.std(axis=0)
        mean_value = reference.mean(axis=0)

        lower_bound = mean_value - num_stds * std_values
        upper_bound = mean_value + num_stds * std_values

        for i in range(np.size(calc_winsorized, 1)):
            col_data = reference[:, i].copy()
            col_data[col_data > upper_bound[i]] = upper_bound[i]
            col_data[col_data < lower_bound[i]] = lower_bound[i]

            calculated_col = calc_winsorized[:, i]
            np.testing.assert_array_almost_equal(col_data, calculated_col)

    def test_winsorize_normal_with_group(self):
        """With groups, the result matches a pandas group-by reference."""
        num_stds = 2

        x = np.random.randn(3000, 10)
        groups = np.random.randint(30, size=3000)

        cal_winsorized = winsorize_normal(x, num_stds, groups)

        def impl(x):
            # Reference clamp applied by pandas to each group's data.
            std_values = x.std(axis=0)
            mean_value = x.mean(axis=0)

            lower_bound = mean_value - num_stds * std_values
            upper_bound = mean_value + num_stds * std_values

            res = np.where(x > upper_bound, upper_bound, x)
            res = np.where(res < lower_bound, lower_bound, res)
            return res

        exp_winsorized = pd.DataFrame(x).groupby(groups).transform(impl).values
        np.testing.assert_array_almost_equal(cal_winsorized, exp_winsorized)
if __name__ == "__main__":
    # Run this module's test cases when the file is executed as a script.
    unittest.main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment