added rank build tests and benchmark

258b0348 · Dr.李 · b971f376 · 258b0348 · 258b0348 · 258b0348
Commit 258b0348 authored Apr 27, 2017 by Dr.李
7 changed files
--- a/alphamind/aggregate.pyx
+++ b/alphamind/aggregate.pyx
@@ -10,6 +10,8 @@ from numpy import zeros
 from numpy import asarray
 cimport cython
 from libc.math cimport sqrt
+from libc.stdlib cimport calloc
+from libc.stdlib cimport free


 @cython.boundscheck(False)
@@ -26,69 +28,76 @@ cdef int max_groups(long* groups, size_t length) nogil:
            curr_max = curr
    return curr_max

+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.cdivision(True)
 @cython.initializedcheck(False)
-cdef double[:, :] agg_mean(long* groups, double* x, size_t length, size_t width):
+cdef double* agg_mean(long* groups, double* x, size_t length, size_t width) nogil:  # per-group column means; returns a calloc'd buffer the caller must free()
    cdef long max_g = max_groups(groups, length)
-    cdef double[:, :] res = zeros((max_g+1, width))
-    cdef double* res_ptr = &res[0, 0]
-    cdef long[:] bin_count = zeros(max_g+1, dtype=int)
-    cdef long* bin_count_ptr = &bin_count[0]
+    cdef double* res_ptr = <double*>calloc((max_g+1)*width, sizeof(double))  # (max_g+1) rows of `width` doubles, zero-initialised
+    cdef long* bin_count_ptr = <long*>calloc(max_g+1, sizeof(long))  # element size must be sizeof(long): the buffer is indexed through a long*
    cdef size_t i
    cdef size_t j
+    cdef size_t loop_idx1
+    cdef size_t loop_idx2
    cdef long curr

-    with nogil:
-        for i in range(length):
+    for i in range(length):  # pass 1: accumulate per-group column sums and row counts
+        loop_idx1 = i*width
+        loop_idx2 = groups[i]*width
+        for j in range(width):
+            res_ptr[loop_idx2 + j] += x[loop_idx1 + j]
+        bin_count_ptr[groups[i]] += 1
+
+    for i in range(max_g+1):  # pass 2: divide sums by counts; groups with no rows keep calloc's zeros
+        curr = bin_count_ptr[i]
+        if curr != 0:
+            loop_idx1 = i*width
            for j in range(width):
-                res_ptr[groups[i]*width + j] += x[i*width + j]
-            bin_count_ptr[groups[i]] += 1
+                res_ptr[loop_idx1 + j] /= curr

-        for i in range(max_g+1):
-            curr = bin_count_ptr[i]
-            if curr != 0:
-                for j in range(width):
-                    res_ptr[i*width + j] /= curr
-    return res
+    free(bin_count_ptr)  # the counts are scratch space only
+    return res_ptr


 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.cdivision(True)
 @cython.initializedcheck(False)
-cdef double[:, :] agg_std(long* groups, double* x, size_t length, size_t width, long ddof=1):
+cdef double* agg_std(long* groups, double* x, size_t length, size_t width, long ddof=1) nogil:  # per-group column standard deviations; returns a calloc'd buffer the caller must free()
    cdef long max_g = max_groups(groups, length)
-    cdef double[:, :] running_sum_square = zeros((max_g+1, width))
-    cdef double* running_sum_square_ptr = &running_sum_square[0, 0]
-    cdef double[:, :] running_sum = zeros((max_g+1, width))
-    cdef double* running_sum_ptr = &running_sum[0, 0]
-    cdef long[:] bin_count = zeros(max_g+1, dtype=int)
-    cdef long* bin_count_ptr = &bin_count[0]
+    cdef double* running_sum_square_ptr = <double*>calloc((max_g+1)*width, sizeof(double))  # sum of squares per group/column; overwritten with the result below
+    cdef double* running_sum_ptr = <double*>calloc((max_g+1)*width, sizeof(double))
+    cdef long* bin_count_ptr = <long*>calloc(max_g+1, sizeof(long))  # element size must be sizeof(long): the buffer is indexed through a long*
    cdef size_t i
    cdef size_t j
-    cdef long k
-    cdef size_t indice
+    cdef size_t loop_idx1
+    cdef size_t loop_idx2
    cdef long curr
    cdef double raw_value

-    with nogil:
-        for i in range(length):
-            k = groups[i]
+    for i in range(length):  # pass 1: accumulate per-group sums, sums of squares and row counts
+        loop_idx1 = i * width
+        loop_idx2 = groups[i] * width
+
+        for j in range(width):
+            raw_value = x[loop_idx1 + j]
+            running_sum_ptr[loop_idx2 + j] += raw_value
+            running_sum_square_ptr[loop_idx2 + j] += raw_value * raw_value
+        bin_count_ptr[groups[i]] += 1
+
+    for i in range(max_g+1):  # pass 2: sqrt((sum_sq - sum^2 / n) / (n - ddof)) for every non-empty group
+        curr = bin_count_ptr[i]
+        loop_idx1 = i * width
+        if curr != 0:
            for j in range(width):
-                raw_value = x[i*width + j]
-                running_sum_ptr[k*width + j] += raw_value
-                running_sum_square_ptr[k*width + j] += raw_value * raw_value
-            bin_count_ptr[k] += 1
+                loop_idx2 = loop_idx1 + j  # NOTE(review): the next line divides by (curr - ddof), which is zero when curr == ddof -- confirm that is acceptable
+                running_sum_square_ptr[loop_idx2] = sqrt((running_sum_square_ptr[loop_idx2] - running_sum_ptr[loop_idx2] * running_sum_ptr[loop_idx2] / curr) / (curr - ddof))

-        for i in range(max_g+1):
-            curr = bin_count_ptr[i]
-            if curr != 0:
-                for j in range(width):
-                    indice = i * width + j
-                    running_sum_square_ptr[indice] = sqrt((running_sum_square_ptr[indice] - running_sum_ptr[indice] * running_sum_ptr[indice] / curr) / (curr - ddof))
-    return running_sum_square
+    free(running_sum_ptr)  # sums and counts are scratch space only
+    free(bin_count_ptr)
+    return running_sum_square_ptr


 @cython.boundscheck(False)
@@ -100,23 +109,22 @@ cpdef np.ndarray[double, ndim=2] transform(long[:] groups, double[:, :] x, str f
    cdef size_t width = x.shape[1]
    cdef double[:, :] res_data = zeros((length, width))
    cdef double* res_data_ptr = &res_data[0, 0]
-    cdef double[:, :] value_data = zeros((length, width))
    cdef double* value_data_ptr
    cdef size_t i
    cdef size_t j
-    cdef size_t k
+    cdef size_t loop_idx1
+    cdef size_t loop_idx2

    if func == 'mean':
-        value_data = agg_mean(&groups[0], &x[0, 0], length, width)
+        value_data_ptr = agg_mean(&groups[0], &x[0, 0], length, width)
    elif func == 'std':
-        value_data = agg_std(&groups[0], &x[0, 0], length, width, ddof=1)
-
-    value_data_ptr = &value_data[0, 0]
+        value_data_ptr = agg_std(&groups[0], &x[0, 0], length, width, ddof=1)

    with nogil:
        for i in range(length):
-            k = groups[i]
+            loop_idx1 = i*width
+            loop_idx2 = groups[i] * width
            for j in range(width):
-                res_data_ptr[i*width + j] = value_data_ptr[k*width + j]
-
+                res_data_ptr[loop_idx1 + j] = value_data_ptr[loop_idx2 + j]
+    free(value_data_ptr)
    return asarray(res_data)
\ No newline at end of file
--- a/alphamind/benchmarks/benchmarks.py
+++ b/alphamind/benchmarks/benchmarks.py
@@ -10,6 +10,8 @@ from alphamind.benchmarks.data.standardize import benchmark_standardize
 from alphamind.benchmarks.data.standardize import benchmark_standardize_with_group
 from alphamind.benchmarks.data.winsorize import benchmark_winsorize_normal
 from alphamind.benchmarks.data.winsorize import benchmark_winsorize_normal_with_group
+from alphamind.benchmarks.portfolio.rankbuild import benchmark_build_rank
+from alphamind.benchmarks.portfolio.rankbuild import benchmark_build_rank_with_group


 if __name__ == '__main__':
@@ -28,3 +30,9 @@ if __name__ == '__main__':
    benchmark_winsorize_normal_with_group(30, 10, 5000, 5)
    benchmark_winsorize_normal(50000, 50, 20)
    benchmark_winsorize_normal_with_group(50000, 50, 20, 50)
+    benchmark_build_rank(3000, 1000, 300)
+    benchmark_build_rank_with_group(3000, 1000, 10, 30)
+    benchmark_build_rank(30, 50000, 3)
+    benchmark_build_rank_with_group(30, 50000, 1, 3)
+    benchmark_build_rank(50000, 20, 3000)
+    benchmark_build_rank_with_group(50000, 20, 10, 300)
--- a/alphamind/benchmarks/portfolio/__init__.py
+++ b/alphamind/benchmarks/portfolio/__init__.py
+# -*- coding: utf-8 -*-
+"""
+Created on 2017-4-27
+
+@author: cheng.li
+"""
\ No newline at end of file
--- a/alphamind/benchmarks/portfolio/rankbuild.py
+++ b/alphamind/benchmarks/portfolio/rankbuild.py
+# -*- coding: utf-8 -*-
+"""
+Created on 2017-4-27
+
+@author: cheng.li
+"""
+
+import datetime as dt
+import numpy as np
+import pandas as pd
+from alphamind.portfolio.rankbuilder import rank_build
+
+
+def benchmark_build_rank(n_samples: int, n_loops: int, n_included: int) -> None:
+    """Time ``rank_build`` against a NumPy double-argsort baseline.
+
+    One vector of ``n_samples`` standard-normal scores is drawn, then each
+    implementation is run ``n_loops`` times and its elapsed wall-clock time
+    is printed. The baseline gives weight ``1 / n_included`` to the
+    ``n_included`` largest scores.
+    """
+    print("-" * 60)
+    print("Starting portfolio construction by rank benchmarking")
+    params_line = "Parameters(n_samples: {0}, n_included: {1}, n_loops: {2})"
+    print(params_line.format(n_samples, n_included, n_loops))
+
+    x = np.random.randn(n_samples)
+    report_line = '{0:20s}: {1}'
+
+    # Implementation under test; the result is discarded, only time matters.
+    tic = dt.datetime.now()
+    for _ in range(n_loops):
+        rank_build(x, n_included)
+    impl_model_time = dt.datetime.now() - tic
+    print(report_line.format('Implemented model', impl_model_time))
+
+    # Baseline: argsort of an argsort gives each score's 0-based descending rank.
+    tic = dt.datetime.now()
+    for _ in range(n_loops):
+        baseline_weights = np.zeros(len(x))
+        top_mask = (-x).argsort().argsort() < n_included
+        baseline_weights[top_mask] = 1. / n_included
+    benchmark_model_time = dt.datetime.now() - tic
+    print(report_line.format('Benchmark model', benchmark_model_time))
+
+
+def benchmark_build_rank_with_group(n_samples: int, n_loops: int, n_included: int, n_groups: int) -> None:
+    """Time grouped ``rank_build`` against a pandas group-by rank baseline.
+
+    Draws ``n_samples`` standard-normal scores and a random integer group
+    label in ``[0, n_groups)`` for each, then runs each implementation
+    ``n_loops`` times and prints its elapsed wall-clock time. The baseline
+    selects, within every group, the entries whose descending rank is at
+    most ``n_included`` and weights all selected entries equally.
+    """
+    print("-" * 60)
+    print("Starting  portfolio construction by rank with group-by values benchmarking")
+    params_line = "Parameters(n_samples: {0}, n_included: {1}, n_loops: {2}, n_groups: {3})"
+    print(params_line.format(n_samples, n_included, n_loops, n_groups))
+
+    x = np.random.randn(n_samples)
+    groups = np.random.randint(n_groups, size=n_samples)
+    report_line = '{0:20s}: {1}'
+
+    # Implementation under test; the result is discarded, only time matters.
+    tic = dt.datetime.now()
+    for _ in range(n_loops):
+        rank_build(x, n_included, groups=groups)
+    impl_model_time = dt.datetime.now() - tic
+    print(report_line.format('Implemented model', impl_model_time))
+
+    # Baseline: rank the negated scores inside each group with pandas.
+    tic = dt.datetime.now()
+    for _ in range(n_loops):
+        within_group_rank = pd.Series(-x).groupby(groups).rank()
+        selected = within_group_rank <= n_included
+        baseline_weights = np.zeros(len(x))
+        baseline_weights[selected] = 1. / np.sum(selected)
+    benchmark_model_time = dt.datetime.now() - tic
+    print(report_line.format('Benchmark model', benchmark_model_time))
+
+
+if __name__ == '__main__':
+    # Quick manual run of both benchmarks with one mid-sized configuration.
+    benchmark_build_rank(n_samples=3000, n_loops=1000, n_included=300)
+    benchmark_build_rank_with_group(n_samples=3000, n_loops=1000, n_included=10, n_groups=30)
--- a/alphamind/portfolio/rankbuilder.py
+++ b/alphamind/portfolio/rankbuilder.py
@@ -15,12 +15,15 @@ def rank_build(er: np.ndarray, use_rank: int, groups: np.ndarray=None) -> np.nda

    if groups is not None:
        max_g = np.max(groups)
+        index_range = np.arange(len(er))

        for i in range(max_g + 1):
            current_mask = groups == i
-            current_ordering = ordering[current_mask]
-            masks[current_ordering[:use_rank]] = True
+            current_index = index_range[current_mask]
+            current_ordering = neg_er[current_mask].argsort()
+            masks[current_index[current_ordering[:use_rank]]] = True
    else:
+
        masks[ordering[:use_rank]] = True

    weights = np.zeros(len(er))
@@ -33,10 +36,10 @@ if __name__ == '__main__':
    import datetime as dt

    x = np.random.randn(3000)
-    groups = np.random.randint(20, 50, size=3000)
+
+    groups = np.random.randint(30, size=3000)

    start = dt.datetime.now()
    for i in range(10000):
-        weights = rank_build(x, 20, groups)
+        weights = rank_build(x, 30, groups)
    print(dt.datetime.now() - start)
-    #print(x, '\n', weights)
--- a/alphamind/tests/portfolio/__init__.py
+++ b/alphamind/tests/portfolio/__init__.py
+# -*- coding: utf-8 -*-
+"""
+Created on 2017-4-27
+
+@author: cheng.li
+"""
\ No newline at end of file
--- a/alphamind/tests/portfolio/test_rankbuild.py
+++ b/alphamind/tests/portfolio/test_rankbuild.py
+# -*- coding: utf-8 -*-
+"""
+Created on 2017-4-27
+
+@author: cheng.li
+"""
+
+import unittest
+import numpy as np
+import pandas as pd
+from alphamind.portfolio.rankbuilder import rank_build
+
+
+class TestRankBuild(unittest.TestCase):
+    """Compare ``rank_build`` weights with NumPy / pandas reference results."""
+
+    def test_rank_build(self):
+        """Ungrouped case: the ``top_n`` largest scores share equal weight."""
+        sample_size = 3000
+        top_n = 300
+        scores = np.random.randn(sample_size)
+
+        actual = rank_build(scores, top_n)
+
+        # argsort of an argsort gives each score's 0-based descending rank.
+        descending_rank = (-scores).argsort().argsort()
+        expected = np.zeros(len(scores))
+        expected[descending_rank < top_n] = 1. / top_n
+
+        np.testing.assert_array_almost_equal(actual, expected)
+
+    def test_rank_build_with_group(self):
+        """Grouped case: top ``top_n`` per group, equal weight over all picks."""
+        sample_size = 3000
+        top_n = 10
+        group_count = 30
+        scores = np.random.randn(sample_size)
+        labels = np.random.randint(group_count, size=sample_size)
+
+        actual = rank_build(scores, top_n, labels)
+
+        # Reference ranking computed per group with pandas.
+        within_group_rank = pd.Series(-scores).groupby(labels).rank()
+        selected = within_group_rank <= top_n
+        expected = np.zeros(len(scores))
+        expected[selected] = 1. / np.sum(selected)
+
+        np.testing.assert_array_almost_equal(actual, expected)
+
+
+# Allow running this test module directly as a script.
+if __name__ == '__main__':
+    unittest.main()