Commit 474f1fbb authored by Dr.李

fixed rank build bug

parent 66b2fbf2
......@@ -18,7 +18,7 @@ def benchmark_neutralize(n_samples: int, n_features: int, n_loops: int) -> None:
print("Starting least square fitting benchmarking")
print("Parameters(n_samples: {0}, n_features: {1}, n_loops: {2})".format(n_samples, n_features, n_loops))
y = np.random.randn(n_samples)
y = np.random.randn(n_samples, 5)
x = np.random.randn(n_samples, n_features)
start = dt.datetime.now()
......@@ -32,7 +32,7 @@ def benchmark_neutralize(n_samples: int, n_features: int, n_loops: int) -> None:
for _ in range(n_loops):
benchmark_model = LinearRegression(fit_intercept=False)
benchmark_model.fit(x, y)
_ = y - x @ benchmark_model.coef_
_ = y - x @ benchmark_model.coef_.T
benchmark_model_time = dt.datetime.now() - start
print('{0:20s}: {1}'.format('Benchmark model', benchmark_model_time))
......
......@@ -16,7 +16,7 @@ def benchmark_build_rank(n_samples: int, n_loops: int, n_included: int) -> None:
print("Starting portfolio construction by rank benchmarking")
print("Parameters(n_samples: {0}, n_included: {1}, n_loops: {2})".format(n_samples, n_included, n_loops))
x = np.random.randn(n_samples)
x = np.random.randn(n_samples, 1)
start = dt.datetime.now()
for _ in range(n_loops):
......@@ -27,8 +27,8 @@ def benchmark_build_rank(n_samples: int, n_loops: int, n_included: int) -> None:
start = dt.datetime.now()
for _ in range(n_loops):
expected_weights = np.zeros(len(x))
expected_weights[(-x).argsort().argsort() < n_included] = 1. / n_included
expected_weights = np.zeros((len(x), 1))
expected_weights[(-x).argsort(axis=0).argsort(axis=0) < n_included] = 1. / n_included
benchmark_model_time = dt.datetime.now() - start
print('{0:20s}: {1}'.format('Benchmark model', benchmark_model_time))
......@@ -39,7 +39,7 @@ def benchmark_build_rank_with_group(n_samples: int, n_loops: int, n_included: in
print("Starting portfolio construction by rank with group-by values benchmarking")
print("Parameters(n_samples: {0}, n_included: {1}, n_loops: {2}, n_groups: {3})".format(n_samples, n_included, n_loops, n_groups))
x = np.random.randn(n_samples)
x = np.random.randn(n_samples, 1)
groups = np.random.randint(n_groups, size=n_samples)
start = dt.datetime.now()
......@@ -51,8 +51,8 @@ def benchmark_build_rank_with_group(n_samples: int, n_loops: int, n_included: in
start = dt.datetime.now()
for _ in range(n_loops):
grouped_ordering = pd.Series(-x).groupby(groups).rank()
expected_weights = np.zeros(len(x))
grouped_ordering = pd.DataFrame(-x).groupby(groups).rank()
expected_weights = np.zeros((len(x), 1))
masks = grouped_ordering <= n_included
expected_weights[masks] = 1. / np.sum(masks)
benchmark_model_time = dt.datetime.now() - start
......
......@@ -7,40 +7,67 @@ Created on 2017-4-26
import numpy as np
from numpy import zeros
from numpy import arange
def rank_build(er: np.ndarray, use_rank: int, groups: np.ndarray=None) -> np.ndarray:
    """Build equal-weight portfolios from the highest-ranked rows of each column.

    Each column of ``er`` is ranked independently, larger values first.

    * Without ``groups`` the ``use_rank`` largest entries of a column each
      receive the weight ``1 / use_rank``.
    * With ``groups`` the ``use_rank`` largest entries *inside every group*
      are selected, and every selected entry of a column receives
      ``1 / (number of entries selected in that column)``.

    All other entries are zero.

    Args:
        er: 1-D array of length ``n`` or 2-D array of shape ``(n, k)``.
        use_rank: number of entries to select (per group when ``groups`` is
            given).
        groups: optional array of ``n`` group labels, one per row of ``er``.
            Every distinct label is treated as a group.

    Returns:
        Array of weights with shape ``(n, 1)`` for 1-D input and ``(n, k)``
        for 2-D input.
    """
    if groups is not None:
        groups = np.asarray(groups)

    # Only a genuine single column may be flattened.  A (1, k) input is one
    # row with k columns and must go through the general path; otherwise it
    # would be ranked across columns and come back with shape (k, 1).
    if er.ndim == 1 or er.shape[1] == 1:
        # Fast path: work on a flat vector and avoid axis-wise operations.
        neg_er = -er.flatten()
        length = len(neg_er)
        weights = zeros((length, 1))

        if groups is None:
            ordering = neg_er.argsort()
            weights[ordering[:use_rank]] = 1. / use_rank
            return weights

        index_range = arange(length)
        masks = zeros(length, dtype=bool)
        # Iterate over the labels that actually occur, so negative or
        # sparse labels are handled and absent labels cost nothing.
        for label in np.unique(groups):
            current_index = index_range[groups == label]
            current_ordering = neg_er[current_index].argsort()
            masks[current_index[current_ordering[:use_rank]]] = True
        n_chosen = masks.sum()
        if n_chosen:
            weights[masks] = 1. / n_chosen
        return weights

    length, width = er.shape
    neg_er = -er
    weights = zeros((length, width))
    columns = arange(width)

    if groups is None:
        ordering = neg_er.argsort(axis=0)
        # ordering[:use_rank] holds, per column, the row indices of the
        # largest entries; pairing it with ``columns`` addresses them all.
        weights[ordering[:use_rank], columns] = 1. / use_rank
        return weights

    index_range = arange(length)
    masks = zeros((length, width), dtype=bool)
    for label in np.unique(groups):
        current_index = index_range[groups == label]
        current_ordering = neg_er[current_index].argsort(axis=0)
        masks[current_index[current_ordering[:use_rank]], columns] = True

    chosen = masks.sum(axis=0)
    # Divide only where an entry was selected: unselected entries keep their
    # initial zero and a column with nothing selected never divides by zero.
    np.divide(masks, chosen, out=weights, where=masks)
    return weights
if __name__ == '__main__':
    # Ad-hoc timing of rank_build on a small random 2-column input.
    import datetime as dt

    x = np.random.randn(4, 2)

    start = dt.datetime.now()
    for _ in range(100):
        weights = rank_build(x, 1)
    print(dt.datetime.now() - start)
......@@ -15,7 +15,7 @@ class TestNeutralize(unittest.TestCase):
def test_neutralize(self):
y = np.random.randn(3000)
y = np.random.randn(3000, 4)
x = np.random.randn(3000, 10)
calc_res = neutralize(x, y)
......@@ -23,7 +23,7 @@ class TestNeutralize(unittest.TestCase):
model = LinearRegression(fit_intercept=False)
model.fit(x, y)
exp_res = y - x @ model.coef_
exp_res = y - x @ model.coef_.T
np.testing.assert_array_almost_equal(calc_res, exp_res)
......
......@@ -18,12 +18,15 @@ class TestRankBuild(unittest.TestCase):
n_samples = 3000
n_included = 300
x = np.random.randn(n_samples)
x = np.random.randn(n_samples, 2)
calc_weights = rank_build(x, n_included)
expected_weights = np.zeros(len(x))
expected_weights[(-x).argsort().argsort() < n_included] = 1. / n_included
expected_weights = np.zeros((len(x), 2))
masks = (-x).argsort(axis=0).argsort(axis=0) < n_included
for j in range(x.shape[1]):
expected_weights[masks[:, j], j] = 1. / n_included
np.testing.assert_array_almost_equal(calc_weights, expected_weights)
......@@ -33,15 +36,17 @@ class TestRankBuild(unittest.TestCase):
n_include = 10
n_groups = 30
x = np.random.randn(n_samples)
x = np.random.randn(n_samples, 2)
groups = np.random.randint(n_groups, size=n_samples)
calc_weights = rank_build(x, n_include, groups)
grouped_ordering = pd.Series(-x).groupby(groups).rank()
expected_weights = np.zeros(len(x))
masks = grouped_ordering <= n_include
expected_weights[masks] = 1. / np.sum(masks)
grouped_ordering = pd.DataFrame(-x).groupby(groups).rank()
expected_weights = np.zeros((len(x), 2))
masks = (grouped_ordering <= n_include).values
choosed = masks.sum(axis=0)
for j in range(x.shape[1]):
expected_weights[masks[:, j], j] = 1. / choosed[j]
np.testing.assert_array_almost_equal(calc_weights, expected_weights)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment