Commit 474f1fbb authored by Dr.李

fixed rank build bug

parent 66b2fbf2
...@@ -18,7 +18,7 @@ def benchmark_neutralize(n_samples: int, n_features: int, n_loops: int) -> None: ...@@ -18,7 +18,7 @@ def benchmark_neutralize(n_samples: int, n_features: int, n_loops: int) -> None:
print("Starting least square fitting benchmarking") print("Starting least square fitting benchmarking")
print("Parameters(n_samples: {0}, n_features: {1}, n_loops: {2})".format(n_samples, n_features, n_loops)) print("Parameters(n_samples: {0}, n_features: {1}, n_loops: {2})".format(n_samples, n_features, n_loops))
y = np.random.randn(n_samples) y = np.random.randn(n_samples, 5)
x = np.random.randn(n_samples, n_features) x = np.random.randn(n_samples, n_features)
start = dt.datetime.now() start = dt.datetime.now()
...@@ -32,7 +32,7 @@ def benchmark_neutralize(n_samples: int, n_features: int, n_loops: int) -> None: ...@@ -32,7 +32,7 @@ def benchmark_neutralize(n_samples: int, n_features: int, n_loops: int) -> None:
for _ in range(n_loops): for _ in range(n_loops):
benchmark_model = LinearRegression(fit_intercept=False) benchmark_model = LinearRegression(fit_intercept=False)
benchmark_model.fit(x, y) benchmark_model.fit(x, y)
_ = y - x @ benchmark_model.coef_ _ = y - x @ benchmark_model.coef_.T
benchmark_model_time = dt.datetime.now() - start benchmark_model_time = dt.datetime.now() - start
print('{0:20s}: {1}'.format('Benchmark model', benchmark_model_time)) print('{0:20s}: {1}'.format('Benchmark model', benchmark_model_time))
......
...@@ -16,7 +16,7 @@ def benchmark_build_rank(n_samples: int, n_loops: int, n_included: int) -> None: ...@@ -16,7 +16,7 @@ def benchmark_build_rank(n_samples: int, n_loops: int, n_included: int) -> None:
print("Starting portfolio construction by rank benchmarking") print("Starting portfolio construction by rank benchmarking")
print("Parameters(n_samples: {0}, n_included: {1}, n_loops: {2})".format(n_samples, n_included, n_loops)) print("Parameters(n_samples: {0}, n_included: {1}, n_loops: {2})".format(n_samples, n_included, n_loops))
x = np.random.randn(n_samples) x = np.random.randn(n_samples, 1)
start = dt.datetime.now() start = dt.datetime.now()
for _ in range(n_loops): for _ in range(n_loops):
...@@ -27,8 +27,8 @@ def benchmark_build_rank(n_samples: int, n_loops: int, n_included: int) -> None: ...@@ -27,8 +27,8 @@ def benchmark_build_rank(n_samples: int, n_loops: int, n_included: int) -> None:
start = dt.datetime.now() start = dt.datetime.now()
for _ in range(n_loops): for _ in range(n_loops):
expected_weights = np.zeros(len(x)) expected_weights = np.zeros((len(x), 1))
expected_weights[(-x).argsort().argsort() < n_included] = 1. / n_included expected_weights[(-x).argsort(axis=0).argsort(axis=0) < n_included] = 1. / n_included
benchmark_model_time = dt.datetime.now() - start benchmark_model_time = dt.datetime.now() - start
print('{0:20s}: {1}'.format('Benchmark model', benchmark_model_time)) print('{0:20s}: {1}'.format('Benchmark model', benchmark_model_time))
...@@ -39,7 +39,7 @@ def benchmark_build_rank_with_group(n_samples: int, n_loops: int, n_included: in ...@@ -39,7 +39,7 @@ def benchmark_build_rank_with_group(n_samples: int, n_loops: int, n_included: in
print("Starting portfolio construction by rank with group-by values benchmarking") print("Starting portfolio construction by rank with group-by values benchmarking")
print("Parameters(n_samples: {0}, n_included: {1}, n_loops: {2}, n_groups: {3})".format(n_samples, n_included, n_loops, n_groups)) print("Parameters(n_samples: {0}, n_included: {1}, n_loops: {2}, n_groups: {3})".format(n_samples, n_included, n_loops, n_groups))
x = np.random.randn(n_samples) x = np.random.randn(n_samples, 1)
groups = np.random.randint(n_groups, size=n_samples) groups = np.random.randint(n_groups, size=n_samples)
start = dt.datetime.now() start = dt.datetime.now()
...@@ -51,8 +51,8 @@ def benchmark_build_rank_with_group(n_samples: int, n_loops: int, n_included: in ...@@ -51,8 +51,8 @@ def benchmark_build_rank_with_group(n_samples: int, n_loops: int, n_included: in
start = dt.datetime.now() start = dt.datetime.now()
for _ in range(n_loops): for _ in range(n_loops):
grouped_ordering = pd.Series(-x).groupby(groups).rank() grouped_ordering = pd.DataFrame(-x).groupby(groups).rank()
expected_weights = np.zeros(len(x)) expected_weights = np.zeros((len(x), 1))
masks = grouped_ordering <= n_included masks = grouped_ordering <= n_included
expected_weights[masks] = 1. / np.sum(masks) expected_weights[masks] = 1. / np.sum(masks)
benchmark_model_time = dt.datetime.now() - start benchmark_model_time = dt.datetime.now() - start
......
...@@ -7,18 +7,21 @@ Created on 2017-4-26 ...@@ -7,18 +7,21 @@ Created on 2017-4-26
import numpy as np import numpy as np
from numpy import zeros from numpy import zeros
from numpy import arange
def rank_build(er: np.ndarray, use_rank: int, groups: np.ndarray=None) -> np.ndarray: def rank_build(er: np.ndarray, use_rank: int, groups: np.ndarray=None) -> np.ndarray:
length = len(er)
neg_er = -er
masks = zeros(length, dtype=bool)
weights = zeros(length)
if er.ndim == 1 or (er.shape[0] == 1 or er.shape[1] == 1):
""" fast path """
neg_er = -er.flatten()
length = len(neg_er)
weights = zeros((length, 1))
if groups is not None: if groups is not None:
max_g = groups.max() max_g = groups.max()
index_range = np.arange(length) index_range = arange(length)
for i in range(max_g+1): masks = zeros(length, dtype=bool)
for i in range(max_g + 1):
current_mask = groups == i current_mask = groups == i
current_index = index_range[current_mask] current_index = index_range[current_mask]
current_ordering = neg_er[current_mask].argsort() current_ordering = neg_er[current_mask].argsort()
...@@ -26,8 +29,32 @@ def rank_build(er: np.ndarray, use_rank: int, groups: np.ndarray=None) -> np.nda ...@@ -26,8 +29,32 @@ def rank_build(er: np.ndarray, use_rank: int, groups: np.ndarray=None) -> np.nda
weights[masks] = 1. / masks.sum() weights[masks] = 1. / masks.sum()
else: else:
ordering = neg_er.argsort() ordering = neg_er.argsort()
masks[ordering[:use_rank]] = True weights[ordering[:use_rank]] = 1. / use_rank
weights[masks] = 1. / use_rank return weights
else:
length = er.shape[0]
width = er.shape[1]
neg_er = -er
weights = zeros((length, width))
if groups is not None:
max_g = groups.max()
index_range = arange(length)
masks = zeros((length, width), dtype=bool)
for i in range(max_g+1):
current_mask = groups == i
current_index = index_range[current_mask]
current_ordering = neg_er[current_mask].argsort(axis=0)
for j in range(width):
masks[current_index[current_ordering[:use_rank, j]], j] = True
choosed = masks.sum(axis=0)
for j in range(width):
weights[masks[:, j], j] = 1. / choosed[j]
else:
ordering = neg_er.argsort(axis=0)
for j in range(width):
weights[ordering[:use_rank, j], j] = 1. / use_rank
return weights return weights
...@@ -35,12 +62,12 @@ if __name__ == '__main__': ...@@ -35,12 +62,12 @@ if __name__ == '__main__':
import datetime as dt import datetime as dt
x = np.random.randn(4) x = np.random.randn(4, 2)
groups = np.random.randint(2, size=4) groups = np.random.randint(2, size=4)
start = dt.datetime.now() start = dt.datetime.now()
for i in range(10000): for i in range(100):
weights = rank_build(x, 1, groups) weights = rank_build(x, 1)#, groups)
print(dt.datetime.now() - start) print(dt.datetime.now() - start)
...@@ -15,7 +15,7 @@ class TestNeutralize(unittest.TestCase): ...@@ -15,7 +15,7 @@ class TestNeutralize(unittest.TestCase):
def test_neutralize(self): def test_neutralize(self):
y = np.random.randn(3000) y = np.random.randn(3000, 4)
x = np.random.randn(3000, 10) x = np.random.randn(3000, 10)
calc_res = neutralize(x, y) calc_res = neutralize(x, y)
...@@ -23,7 +23,7 @@ class TestNeutralize(unittest.TestCase): ...@@ -23,7 +23,7 @@ class TestNeutralize(unittest.TestCase):
model = LinearRegression(fit_intercept=False) model = LinearRegression(fit_intercept=False)
model.fit(x, y) model.fit(x, y)
exp_res = y - x @ model.coef_ exp_res = y - x @ model.coef_.T
np.testing.assert_array_almost_equal(calc_res, exp_res) np.testing.assert_array_almost_equal(calc_res, exp_res)
......
...@@ -18,12 +18,15 @@ class TestRankBuild(unittest.TestCase): ...@@ -18,12 +18,15 @@ class TestRankBuild(unittest.TestCase):
n_samples = 3000 n_samples = 3000
n_included = 300 n_included = 300
x = np.random.randn(n_samples) x = np.random.randn(n_samples, 2)
calc_weights = rank_build(x, n_included) calc_weights = rank_build(x, n_included)
expected_weights = np.zeros(len(x)) expected_weights = np.zeros((len(x), 2))
expected_weights[(-x).argsort().argsort() < n_included] = 1. / n_included masks = (-x).argsort(axis=0).argsort(axis=0) < n_included
for j in range(x.shape[1]):
expected_weights[masks[:, j], j] = 1. / n_included
np.testing.assert_array_almost_equal(calc_weights, expected_weights) np.testing.assert_array_almost_equal(calc_weights, expected_weights)
...@@ -33,15 +36,17 @@ class TestRankBuild(unittest.TestCase): ...@@ -33,15 +36,17 @@ class TestRankBuild(unittest.TestCase):
n_include = 10 n_include = 10
n_groups = 30 n_groups = 30
x = np.random.randn(n_samples) x = np.random.randn(n_samples, 2)
groups = np.random.randint(n_groups, size=n_samples) groups = np.random.randint(n_groups, size=n_samples)
calc_weights = rank_build(x, n_include, groups) calc_weights = rank_build(x, n_include, groups)
grouped_ordering = pd.Series(-x).groupby(groups).rank() grouped_ordering = pd.DataFrame(-x).groupby(groups).rank()
expected_weights = np.zeros(len(x)) expected_weights = np.zeros((len(x), 2))
masks = grouped_ordering <= n_include masks = (grouped_ordering <= n_include).values
expected_weights[masks] = 1. / np.sum(masks) choosed = masks.sum(axis=0)
for j in range(x.shape[1]):
expected_weights[masks[:, j], j] = 1. / choosed[j]
np.testing.assert_array_almost_equal(calc_weights, expected_weights) np.testing.assert_array_almost_equal(calc_weights, expected_weights)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment