made rank_build work with very large groups

ae3af0c2 · Dr.李 · 1c62292b · ae3af0c2 · ae3af0c2 · ae3af0c2
Commit ae3af0c2 authored Apr 29, 2017 by Dr.李
5 changed files
--- a/alphamind/aggregate.pyx
+++ b/alphamind/aggregate.pyx
@@ -6,8 +6,6 @@ Created on 2017-4-26
 """

 cimport numpy as np
-from numpy import zeros
-from numpy import asarray
 cimport cython
 from libc.math cimport sqrt
 from libc.math cimport fabs

--- a/alphamind/benchmarks/benchmarks.py
+++ b/alphamind/benchmarks/benchmarks.py
@@ -18,30 +18,30 @@ from alphamind.benchmarks.settlement.simplesettle import benchmark_simple_settle

 if __name__ == '__main__':

-    benchmark_neutralize(3000, 10, 1000)
-    benchmark_neutralize(30, 10, 50000)
-    benchmark_neutralize(50000, 50, 20)
-    benchmark_standardize(3000, 10, 1000)
-    benchmark_standardize_with_group(3000, 10, 1000, 30)
-    benchmark_standardize(30, 10, 50000)
-    benchmark_standardize_with_group(30, 10, 5000, 5)
-    benchmark_standardize(50000, 50, 20)
-    benchmark_standardize_with_group(50000, 50, 20, 50)
-    benchmark_winsorize_normal(3000, 10, 1000)
-    benchmark_winsorize_normal_with_group(3000, 10, 1000, 30)
-    benchmark_winsorize_normal(30, 10, 50000)
-    benchmark_winsorize_normal_with_group(30, 10, 5000, 5)
-    benchmark_winsorize_normal(50000, 50, 20)
-    benchmark_winsorize_normal_with_group(50000, 50, 20, 50)
+    # benchmark_neutralize(3000, 10, 1000)
+    # benchmark_neutralize(30, 10, 50000)
+    # benchmark_neutralize(50000, 50, 20)
+    # benchmark_standardize(3000, 10, 1000)
+    # benchmark_standardize_with_group(3000, 10, 1000, 30)
+    # benchmark_standardize(30, 10, 50000)
+    # benchmark_standardize_with_group(30, 10, 5000, 5)
+    # benchmark_standardize(50000, 50, 20)
+    # benchmark_standardize_with_group(50000, 50, 20, 50)
+    # benchmark_winsorize_normal(3000, 10, 1000)
+    # benchmark_winsorize_normal_with_group(3000, 10, 1000, 30)
+    # benchmark_winsorize_normal(30, 10, 50000)
+    # benchmark_winsorize_normal_with_group(30, 10, 5000, 5)
+    # benchmark_winsorize_normal(50000, 50, 20)
+    # benchmark_winsorize_normal_with_group(50000, 50, 20, 50)
    benchmark_build_rank(3000, 1000, 300)
    benchmark_build_rank_with_group(3000, 1000, 10, 30)
    benchmark_build_rank(30, 50000, 3)
    benchmark_build_rank_with_group(30, 50000, 1, 3)
    benchmark_build_rank(50000, 20, 3000)
    benchmark_build_rank_with_group(50000, 20, 10, 300)
-    benchmark_simple_settle(3000, 10, 1000)
-    benchmark_simple_settle_with_group(3000, 10, 1000, 30)
-    benchmark_simple_settle(30, 10, 50000)
-    benchmark_simple_settle_with_group(30, 10, 5000, 5)
-    benchmark_simple_settle(50000, 50, 20)
-    benchmark_simple_settle_with_group(50000, 50, 20, 50)
+    # benchmark_simple_settle(3000, 10, 1000)
+    # benchmark_simple_settle_with_group(3000, 10, 1000, 30)
+    # benchmark_simple_settle(30, 10, 50000)
+    # benchmark_simple_settle_with_group(30, 10, 5000, 5)
+    # benchmark_simple_settle(50000, 50, 20)
+    # benchmark_simple_settle_with_group(50000, 50, 20, 50)
--- a/alphamind/portfolio/impl.pyx
+++ b/alphamind/portfolio/impl.pyx
+# -*- coding: utf-8 -*-
+"""
+Created on 2017-4-29
+
+@author: cheng.li
+"""
+
+import numpy as np
+from numpy import array
+cimport numpy as cnp
+cimport cython
+import cytoolz
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cdef inline long index(tuple x):
+    """
+    Return the first element of the tuple ``x`` as a C ``long``.
+
+    Passed to ``cytoolz.groupby`` in ``groupby`` as the key function, where
+    each tuple is a ``(group label, position)`` pair.
+    """
+    return x[0]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cpdef list groupby(long[:] groups):
+    """
+    Collect, for every distinct label in ``groups``, the positions where it occurs.
+
+    :param groups: one-dimensional memoryview of C ``long`` group labels
+    :return: list holding one numpy array of positions per distinct label,
+             in the iteration order of the dict returned by ``cytoolz.groupby``
+    """

+    # NOTE(review): positions are held in a C int, so they are limited to the
+    # range of that type -- confirm this is enough for the largest inputs.
+    cdef int i
+    cdef long d
+    cdef list table
+    cdef tuple t
+    cdef list v
+    cdef dict group_dict
+    cdef list group_ids
+
+    # Pair each label with its position so the position survives the grouping.
+    table = [(d, i) for i, d in enumerate(groups)]
+    # Bucket the pairs by label; ``index`` picks element 0 of each pair as key.
+    group_dict = cytoolz.groupby(index, table)
+    # Drop the labels and keep only the positions, one numpy array per bucket.
+    group_ids = [array([t[1] for t in v]) for v in group_dict.values()]
+    return group_ids
\ No newline at end of file
--- a/alphamind/portfolio/rankbuilder.py
+++ b/alphamind/portfolio/rankbuilder.py
@@ -7,7 +7,7 @@ Created on 2017-4-26

 import numpy as np
 from numpy import zeros
-from numpy import arange
+from alphamind.portfolio.impl import groupby


 def rank_build(er: np.ndarray, use_rank: int, groups: np.ndarray=None) -> np.ndarray:
@@ -18,13 +18,10 @@ def rank_build(er: np.ndarray, use_rank: int, groups: np.ndarray=None) -> np.nda
        length = len(neg_er)
        weights = zeros((length, 1))
        if groups is not None:
-            max_g = groups.max()
-            index_range = arange(length)
+            group_ids = groupby(groups)
            masks = zeros(length, dtype=bool)
-            for i in range(max_g + 1):
-                current_mask = groups == i
-                current_index = index_range[current_mask]
-                current_ordering = neg_er[current_mask].argsort()
+            for current_index in group_ids:
+                current_ordering = neg_er[current_index].argsort()
                masks[current_index[current_ordering[:use_rank]]] = True
            weights[masks] = 1. / masks.sum()
        else:
@@ -38,13 +35,10 @@ def rank_build(er: np.ndarray, use_rank: int, groups: np.ndarray=None) -> np.nda
        weights = zeros((length, width))

        if groups is not None:
-            max_g = groups.max()
-            index_range = arange(length)
+            group_ids = groupby(groups)
            masks = zeros((length, width), dtype=bool)
-            for i in range(max_g+1):
-                current_mask = groups == i
-                current_index = index_range[current_mask]
-                current_ordering = neg_er[current_mask].argsort(axis=0)
+            for current_index in group_ids:
+                current_ordering = neg_er[current_index].argsort(axis=0)
                for j in range(width):
                    masks[current_index[current_ordering[:use_rank, j]], j] = True
            choosed = masks.sum(axis=0)
@@ -58,3 +52,12 @@ def rank_build(er: np.ndarray, use_rank: int, groups: np.ndarray=None) -> np.nda
        return weights


+if __name__ == '__main__':
+    n_samples = 4
+    n_include = 1
+    n_groups = 2
+
+    x = np.random.randn(n_samples, 2)
+    groups = np.random.randint(n_groups, size=n_samples)
+
+    calc_weights = rank_build(x, n_include, groups)
\ No newline at end of file
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,8 @@ else:
    line_trace = False


-ext_modules = ['alphamind/aggregate.pyx']
+ext_modules = ['alphamind/aggregate.pyx',
+               'alphamind/portfolio/impl.pyx']


 def generate_extensions(ext_modules, line_trace=False):