Fixed rank build bug (rank_build now also accepts 2-D input and builds weights column by column)

474f1fbb · Dr.李 · 66b2fbf2
Commit 474f1fbb authored Apr 28, 2017 by Dr.李
5 changed files
--- a/alphamind/benchmarks/data/neutralize.py
+++ b/alphamind/benchmarks/data/neutralize.py
@@ -18,7 +18,7 @@ def benchmark_neutralize(n_samples: int, n_features: int, n_loops: int) -> None:
    print("Starting least square fitting benchmarking")
    print("Parameters(n_samples: {0}, n_features: {1}, n_loops: {2})".format(n_samples, n_features, n_loops))

-    y = np.random.randn(n_samples)
+    y = np.random.randn(n_samples, 5)
    x = np.random.randn(n_samples, n_features)

    start = dt.datetime.now()
@@ -32,7 +32,7 @@ def benchmark_neutralize(n_samples: int, n_features: int, n_loops: int) -> None:
    for _ in range(n_loops):
        benchmark_model = LinearRegression(fit_intercept=False)
        benchmark_model.fit(x, y)
-        _ = y - x @ benchmark_model.coef_
+        _ = y - x @ benchmark_model.coef_.T
    benchmark_model_time = dt.datetime.now() - start

    print('{0:20s}: {1}'.format('Benchmark model', benchmark_model_time))

--- a/alphamind/benchmarks/portfolio/rankbuild.py
+++ b/alphamind/benchmarks/portfolio/rankbuild.py
@@ -16,7 +16,7 @@ def benchmark_build_rank(n_samples: int, n_loops: int, n_included: int) -> None:
    print("Starting portfolio construction by rank benchmarking")
    print("Parameters(n_samples: {0}, n_included: {1}, n_loops: {2})".format(n_samples, n_included, n_loops))

-    x = np.random.randn(n_samples)
+    x = np.random.randn(n_samples, 1)

    start = dt.datetime.now()
    for _ in range(n_loops):
@@ -27,8 +27,8 @@ def benchmark_build_rank(n_samples: int, n_loops: int, n_included: int) -> None:

    start = dt.datetime.now()
    for _ in range(n_loops):
-        expected_weights = np.zeros(len(x))
-        expected_weights[(-x).argsort().argsort() < n_included] = 1. / n_included
+        expected_weights = np.zeros((len(x), 1))
+        expected_weights[(-x).argsort(axis=0).argsort(axis=0) < n_included] = 1. / n_included
    benchmark_model_time = dt.datetime.now() - start

    print('{0:20s}: {1}'.format('Benchmark model', benchmark_model_time))
@@ -39,7 +39,7 @@ def benchmark_build_rank_with_group(n_samples: int, n_loops: int, n_included: in
    print("Starting  portfolio construction by rank with group-by values benchmarking")
    print("Parameters(n_samples: {0}, n_included: {1}, n_loops: {2}, n_groups: {3})".format(n_samples, n_included, n_loops, n_groups))

-    x = np.random.randn(n_samples)
+    x = np.random.randn(n_samples, 1)
    groups = np.random.randint(n_groups, size=n_samples)

    start = dt.datetime.now()
@@ -51,8 +51,8 @@ def benchmark_build_rank_with_group(n_samples: int, n_loops: int, n_included: in

    start = dt.datetime.now()
    for _ in range(n_loops):
-        grouped_ordering = pd.Series(-x).groupby(groups).rank()
-        expected_weights = np.zeros(len(x))
+        grouped_ordering = pd.DataFrame(-x).groupby(groups).rank()
+        expected_weights = np.zeros((len(x), 1))
        masks = grouped_ordering <= n_included
        expected_weights[masks] = 1. / np.sum(masks)
    benchmark_model_time = dt.datetime.now() - start

--- a/alphamind/portfolio/rankbuilder.py
+++ b/alphamind/portfolio/rankbuilder.py
@@ -7,40 +7,67 @@ Created on 2017-4-26

 import numpy as np
 from numpy import zeros
+from numpy import arange


 def rank_build(er: np.ndarray, use_rank: int, groups: np.ndarray=None) -> np.ndarray:
-    length = len(er)
-    neg_er = -er
-    masks = zeros(length, dtype=bool)
-    weights = zeros(length)
-
-    if groups is not None:
-        max_g = groups.max()
-        index_range = np.arange(length)
-        for i in range(max_g+1):
-            current_mask = groups == i
-            current_index = index_range[current_mask]
-            current_ordering = neg_er[current_mask].argsort()
-            masks[current_index[current_ordering[:use_rank]]] = True
-        weights[masks] = 1. / masks.sum()
+
+    if er.ndim == 1 or (er.shape[0] == 1 or er.shape[1] == 1):
+        """ fast path """
+        neg_er = -er.flatten()
+        length = len(neg_er)
+        weights = zeros((length, 1))
+        if groups is not None:
+            max_g = groups.max()
+            index_range = arange(length)
+            masks = zeros(length, dtype=bool)
+            for i in range(max_g + 1):
+                current_mask = groups == i
+                current_index = index_range[current_mask]
+                current_ordering = neg_er[current_mask].argsort()
+                masks[current_index[current_ordering[:use_rank]]] = True
+            weights[masks] = 1. / masks.sum()
+        else:
+            ordering = neg_er.argsort()
+            weights[ordering[:use_rank]] = 1. / use_rank
+        return weights
    else:
-        ordering = neg_er.argsort()
-        masks[ordering[:use_rank]] = True
-        weights[masks] = 1. / use_rank
-    return weights
+        length = er.shape[0]
+        width = er.shape[1]
+        neg_er = -er
+        weights = zeros((length, width))
+
+        if groups is not None:
+            max_g = groups.max()
+            index_range = arange(length)
+            masks = zeros((length, width), dtype=bool)
+            for i in range(max_g+1):
+                current_mask = groups == i
+                current_index = index_range[current_mask]
+                current_ordering = neg_er[current_mask].argsort(axis=0)
+                for j in range(width):
+                    masks[current_index[current_ordering[:use_rank, j]], j] = True
+            choosed = masks.sum(axis=0)
+
+            for j in range(width):
+                weights[masks[:, j], j] = 1. / choosed[j]
+        else:
+            ordering = neg_er.argsort(axis=0)
+            for j in range(width):
+                weights[ordering[:use_rank, j], j] = 1. / use_rank
+        return weights


 if __name__ == '__main__':

    import datetime as dt

-    x = np.random.randn(4)
+    x = np.random.randn(4, 2)

    groups = np.random.randint(2, size=4)

    start = dt.datetime.now()
-    for i in range(10000):
-        weights = rank_build(x, 1, groups)
+    for i in range(100):
+        weights = rank_build(x, 1)#, groups)
    print(dt.datetime.now() - start)

--- a/alphamind/tests/data/test_neutralize.py
+++ b/alphamind/tests/data/test_neutralize.py
@@ -15,7 +15,7 @@ class TestNeutralize(unittest.TestCase):

    def test_neutralize(self):

-        y = np.random.randn(3000)
+        y = np.random.randn(3000, 4)
        x = np.random.randn(3000, 10)

        calc_res = neutralize(x, y)
@@ -23,7 +23,7 @@ class TestNeutralize(unittest.TestCase):
        model = LinearRegression(fit_intercept=False)
        model.fit(x, y)

-        exp_res = y - x @ model.coef_
+        exp_res = y - x @ model.coef_.T

        np.testing.assert_array_almost_equal(calc_res, exp_res)


--- a/alphamind/tests/portfolio/test_rankbuild.py
+++ b/alphamind/tests/portfolio/test_rankbuild.py
@@ -18,12 +18,15 @@ class TestRankBuild(unittest.TestCase):
        n_samples = 3000
        n_included = 300

-        x = np.random.randn(n_samples)
+        x = np.random.randn(n_samples, 2)

        calc_weights = rank_build(x, n_included)

-        expected_weights = np.zeros(len(x))
-        expected_weights[(-x).argsort().argsort() < n_included] = 1. / n_included
+        expected_weights = np.zeros((len(x), 2))
+        masks = (-x).argsort(axis=0).argsort(axis=0) < n_included
+
+        for j in range(x.shape[1]):
+            expected_weights[masks[:, j], j] = 1. / n_included

        np.testing.assert_array_almost_equal(calc_weights, expected_weights)

@@ -33,15 +36,17 @@ class TestRankBuild(unittest.TestCase):
        n_include = 10
        n_groups = 30

-        x = np.random.randn(n_samples)
+        x = np.random.randn(n_samples, 2)
        groups = np.random.randint(n_groups, size=n_samples)

        calc_weights = rank_build(x, n_include, groups)

-        grouped_ordering = pd.Series(-x).groupby(groups).rank()
-        expected_weights = np.zeros(len(x))
-        masks = grouped_ordering <= n_include
-        expected_weights[masks] = 1. / np.sum(masks)
+        grouped_ordering = pd.DataFrame(-x).groupby(groups).rank()
+        expected_weights = np.zeros((len(x), 2))
+        masks = (grouped_ordering <= n_include).values
+        choosed = masks.sum(axis=0)
+        for j in range(x.shape[1]):
+            expected_weights[masks[:, j], j] = 1. / choosed[j]

        np.testing.assert_array_almost_equal(calc_weights, expected_weights)