Commit 258b0348 authored by Dr.李

added rank build tests and benchmark

parent b971f376
...@@ -10,6 +10,8 @@ from numpy import zeros ...@@ -10,6 +10,8 @@ from numpy import zeros
from numpy import asarray from numpy import asarray
cimport cython cimport cython
from libc.math cimport sqrt from libc.math cimport sqrt
from libc.stdlib cimport calloc
from libc.stdlib cimport free
@cython.boundscheck(False) @cython.boundscheck(False)
...@@ -26,69 +28,76 @@ cdef int max_groups(long* groups, size_t length) nogil: ...@@ -26,69 +28,76 @@ cdef int max_groups(long* groups, size_t length) nogil:
curr_max = curr curr_max = curr
return curr_max return curr_max
@cython.boundscheck(False) @cython.boundscheck(False)
@cython.wraparound(False) @cython.wraparound(False)
@cython.cdivision(True) @cython.cdivision(True)
@cython.initializedcheck(False) @cython.initializedcheck(False)
cdef double[:, :] agg_mean(long* groups, double* x, size_t length, size_t width): cdef double* agg_mean(long* groups, double* x, size_t length, size_t width) nogil:
cdef long max_g = max_groups(groups, length) cdef long max_g = max_groups(groups, length)
cdef double[:, :] res = zeros((max_g+1, width)) cdef double* res_ptr = <double*>calloc((max_g+1)*width, sizeof(double))
cdef double* res_ptr = &res[0, 0] cdef long* bin_count_ptr = <long*>calloc(max_g+1, sizeof(int))
cdef long[:] bin_count = zeros(max_g+1, dtype=int)
cdef long* bin_count_ptr = &bin_count[0]
cdef size_t i cdef size_t i
cdef size_t j cdef size_t j
cdef size_t loop_idx1
cdef size_t loop_idx2
cdef long curr cdef long curr
with nogil: for i in range(length):
for i in range(length): loop_idx1 = i*width
loop_idx2 = groups[i]*width
for j in range(width):
res_ptr[loop_idx2 + j] += x[loop_idx1 + j]
bin_count_ptr[groups[i]] += 1
for i in range(max_g+1):
curr = bin_count_ptr[i]
if curr != 0:
loop_idx1 = i*width
for j in range(width): for j in range(width):
res_ptr[groups[i]*width + j] += x[i*width + j] res_ptr[loop_idx1 + j] /= curr
bin_count_ptr[groups[i]] += 1
for i in range(max_g+1): free(bin_count_ptr)
curr = bin_count_ptr[i] return res_ptr
if curr != 0:
for j in range(width):
res_ptr[i*width + j] /= curr
return res
@cython.boundscheck(False) @cython.boundscheck(False)
@cython.wraparound(False) @cython.wraparound(False)
@cython.cdivision(True) @cython.cdivision(True)
@cython.initializedcheck(False) @cython.initializedcheck(False)
cdef double[:, :] agg_std(long* groups, double* x, size_t length, size_t width, long ddof=1): cdef double* agg_std(long* groups, double* x, size_t length, size_t width, long ddof=1) nogil:
cdef long max_g = max_groups(groups, length) cdef long max_g = max_groups(groups, length)
cdef double[:, :] running_sum_square = zeros((max_g+1, width)) cdef double* running_sum_square_ptr = <double*>calloc((max_g+1)*width, sizeof(double))
cdef double* running_sum_square_ptr = &running_sum_square[0, 0] cdef double* running_sum_ptr = <double*>calloc((max_g+1)*width, sizeof(double))
cdef double[:, :] running_sum = zeros((max_g+1, width)) cdef long* bin_count_ptr = <long*>calloc(max_g+1, sizeof(int))
cdef double* running_sum_ptr = &running_sum[0, 0]
cdef long[:] bin_count = zeros(max_g+1, dtype=int)
cdef long* bin_count_ptr = &bin_count[0]
cdef size_t i cdef size_t i
cdef size_t j cdef size_t j
cdef long k cdef size_t loop_idx1
cdef size_t indice cdef size_t loop_idx2
cdef long curr cdef long curr
cdef double raw_value cdef double raw_value
with nogil: for i in range(length):
for i in range(length): loop_idx1 = i * width
k = groups[i] loop_idx2 = groups[i] * width
for j in range(width):
raw_value = x[loop_idx1 + j]
running_sum_ptr[loop_idx2 + j] += raw_value
running_sum_square_ptr[loop_idx2 + j] += raw_value * raw_value
bin_count_ptr[groups[i]] += 1
for i in range(max_g+1):
curr = bin_count_ptr[i]
loop_idx1 = i * width
if curr != 0:
for j in range(width): for j in range(width):
raw_value = x[i*width + j] loop_idx2 = loop_idx1 + j
running_sum_ptr[k*width + j] += raw_value running_sum_square_ptr[loop_idx2] = sqrt((running_sum_square_ptr[loop_idx2] - running_sum_ptr[loop_idx2] * running_sum_ptr[loop_idx2] / curr) / (curr - ddof))
running_sum_square_ptr[k*width + j] += raw_value * raw_value
bin_count_ptr[k] += 1
for i in range(max_g+1): free(running_sum_ptr)
curr = bin_count_ptr[i] free(bin_count_ptr)
if curr != 0: return running_sum_square_ptr
for j in range(width):
indice = i * width + j
running_sum_square_ptr[indice] = sqrt((running_sum_square_ptr[indice] - running_sum_ptr[indice] * running_sum_ptr[indice] / curr) / (curr - ddof))
return running_sum_square
@cython.boundscheck(False) @cython.boundscheck(False)
...@@ -100,23 +109,22 @@ cpdef np.ndarray[double, ndim=2] transform(long[:] groups, double[:, :] x, str f ...@@ -100,23 +109,22 @@ cpdef np.ndarray[double, ndim=2] transform(long[:] groups, double[:, :] x, str f
cdef size_t width = x.shape[1] cdef size_t width = x.shape[1]
cdef double[:, :] res_data = zeros((length, width)) cdef double[:, :] res_data = zeros((length, width))
cdef double* res_data_ptr = &res_data[0, 0] cdef double* res_data_ptr = &res_data[0, 0]
cdef double[:, :] value_data = zeros((length, width))
cdef double* value_data_ptr cdef double* value_data_ptr
cdef size_t i cdef size_t i
cdef size_t j cdef size_t j
cdef size_t k cdef size_t loop_idx1
cdef size_t loop_idx2
if func == 'mean': if func == 'mean':
value_data = agg_mean(&groups[0], &x[0, 0], length, width) value_data_ptr = agg_mean(&groups[0], &x[0, 0], length, width)
elif func == 'std': elif func == 'std':
value_data = agg_std(&groups[0], &x[0, 0], length, width, ddof=1) value_data_ptr = agg_std(&groups[0], &x[0, 0], length, width, ddof=1)
value_data_ptr = &value_data[0, 0]
with nogil: with nogil:
for i in range(length): for i in range(length):
k = groups[i] loop_idx1 = i*width
loop_idx2 = groups[i] * width
for j in range(width): for j in range(width):
res_data_ptr[i*width + j] = value_data_ptr[k*width + j] res_data_ptr[loop_idx1 + j] = value_data_ptr[loop_idx2 + j]
free(value_data_ptr)
return asarray(res_data) return asarray(res_data)
\ No newline at end of file
...@@ -10,6 +10,8 @@ from alphamind.benchmarks.data.standardize import benchmark_standardize ...@@ -10,6 +10,8 @@ from alphamind.benchmarks.data.standardize import benchmark_standardize
from alphamind.benchmarks.data.standardize import benchmark_standardize_with_group from alphamind.benchmarks.data.standardize import benchmark_standardize_with_group
from alphamind.benchmarks.data.winsorize import benchmark_winsorize_normal from alphamind.benchmarks.data.winsorize import benchmark_winsorize_normal
from alphamind.benchmarks.data.winsorize import benchmark_winsorize_normal_with_group from alphamind.benchmarks.data.winsorize import benchmark_winsorize_normal_with_group
from alphamind.benchmarks.portfolio.rankbuild import benchmark_build_rank
from alphamind.benchmarks.portfolio.rankbuild import benchmark_build_rank_with_group
if __name__ == '__main__': if __name__ == '__main__':
...@@ -28,3 +30,9 @@ if __name__ == '__main__': ...@@ -28,3 +30,9 @@ if __name__ == '__main__':
benchmark_winsorize_normal_with_group(30, 10, 5000, 5) benchmark_winsorize_normal_with_group(30, 10, 5000, 5)
benchmark_winsorize_normal(50000, 50, 20) benchmark_winsorize_normal(50000, 50, 20)
benchmark_winsorize_normal_with_group(50000, 50, 20, 50) benchmark_winsorize_normal_with_group(50000, 50, 20, 50)
benchmark_build_rank(3000, 1000, 300)
benchmark_build_rank_with_group(3000, 1000, 10, 30)
benchmark_build_rank(30, 50000, 3)
benchmark_build_rank_with_group(30, 50000, 1, 3)
benchmark_build_rank(50000, 20, 3000)
benchmark_build_rank_with_group(50000, 20, 10, 300)
# -*- coding: utf-8 -*-
"""
Created on 2017-4-27
@author: cheng.li
"""
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on 2017-4-27
@author: cheng.li
"""
import datetime as dt
import numpy as np
import pandas as pd
from alphamind.portfolio.rankbuilder import rank_build
def benchmark_build_rank(n_samples: int, n_loops: int, n_included: int) -> None:
    """Time ``rank_build`` against a plain NumPy double-argsort baseline.

    One vector of ``n_samples`` standard-normal scores is drawn up front.
    Each implementation is then executed ``n_loops`` times on that same
    vector and its total elapsed wall-clock time is printed.

    :param n_samples: length of the random score vector.
    :param n_loops: number of repetitions timed for each implementation.
    :param n_included: forwarded to ``rank_build`` as its second argument;
        in the baseline, entries whose descending rank is below this value
        receive weight ``1 / n_included``.
    """
    report_line = '{0:20s}: {1}'

    print("-" * 60)
    print("Starting portfolio construction by rank benchmarking")
    print("Parameters(n_samples: {0}, n_included: {1}, n_loops: {2})".format(n_samples, n_included, n_loops))

    scores = np.random.randn(n_samples)

    # Implementation under test; the result is discarded, only time matters.
    tic = dt.datetime.now()
    for _ in range(n_loops):
        rank_build(scores, n_included)
    elapsed_impl = dt.datetime.now() - tic
    print(report_line.format('Implemented model', elapsed_impl))

    # Baseline: argsort twice turns the negated scores into descending ranks.
    tic = dt.datetime.now()
    for _ in range(n_loops):
        baseline_weights = np.zeros(len(scores))
        baseline_weights[(-scores).argsort().argsort() < n_included] = 1. / n_included
    elapsed_baseline = dt.datetime.now() - tic
    print(report_line.format('Benchmark model', elapsed_baseline))
def benchmark_build_rank_with_group(n_samples: int, n_loops: int, n_included: int, n_groups: int) -> None:
    """Time grouped ``rank_build`` against a pandas group-by rank baseline.

    Standard-normal scores and integer group labels drawn from
    ``[0, n_groups)`` are generated once. Each implementation is then run
    ``n_loops`` times on the same data and its total elapsed wall-clock
    time is printed.

    :param n_samples: number of scores (and of group labels).
    :param n_loops: number of repetitions timed for each implementation.
    :param n_included: forwarded to ``rank_build`` as its second argument;
        in the baseline, entries whose within-group descending rank is at
        most this value are selected.
    :param n_groups: exclusive upper bound of the random group labels.
    """
    report_line = '{0:20s}: {1}'

    print("-" * 60)
    print("Starting portfolio construction by rank with group-by values benchmarking")
    print("Parameters(n_samples: {0}, n_included: {1}, n_loops: {2}, n_groups: {3})".format(n_samples, n_included, n_loops, n_groups))

    scores = np.random.randn(n_samples)
    groups = np.random.randint(n_groups, size=n_samples)

    # Implementation under test; the result is discarded, only time matters.
    tic = dt.datetime.now()
    for _ in range(n_loops):
        rank_build(scores, n_included, groups=groups)
    elapsed_impl = dt.datetime.now() - tic
    print(report_line.format('Implemented model', elapsed_impl))

    # Baseline: rank the negated scores inside each group, keep the top
    # ranks and spread the weight equally over every selected entry.
    tic = dt.datetime.now()
    for _ in range(n_loops):
        within_group_rank = pd.Series(-scores).groupby(groups).rank()
        baseline_weights = np.zeros(len(scores))
        selected = within_group_rank <= n_included
        baseline_weights[selected] = 1. / np.sum(selected)
    elapsed_baseline = dt.datetime.now() - tic
    print(report_line.format('Benchmark model', elapsed_baseline))
if __name__ == '__main__':
    # Run both rank-building benchmarks once when executed as a script.
    benchmark_build_rank(n_samples=3000, n_loops=1000, n_included=300)
    benchmark_build_rank_with_group(n_samples=3000, n_loops=1000, n_included=10, n_groups=30)
...@@ -15,12 +15,15 @@ def rank_build(er: np.ndarray, use_rank: int, groups: np.ndarray=None) -> np.nda ...@@ -15,12 +15,15 @@ def rank_build(er: np.ndarray, use_rank: int, groups: np.ndarray=None) -> np.nda
if groups is not None: if groups is not None:
max_g = np.max(groups) max_g = np.max(groups)
index_range = np.arange(len(er))
for i in range(max_g + 1): for i in range(max_g + 1):
current_mask = groups == i current_mask = groups == i
current_ordering = ordering[current_mask] current_index = index_range[current_mask]
masks[current_ordering[:use_rank]] = True current_ordering = neg_er[current_mask].argsort()
masks[current_index[current_ordering[:use_rank]]] = True
else: else:
masks[ordering[:use_rank]] = True masks[ordering[:use_rank]] = True
weights = np.zeros(len(er)) weights = np.zeros(len(er))
...@@ -33,10 +36,10 @@ if __name__ == '__main__': ...@@ -33,10 +36,10 @@ if __name__ == '__main__':
import datetime as dt import datetime as dt
x = np.random.randn(3000) x = np.random.randn(3000)
groups = np.random.randint(20, 50, size=3000)
groups = np.random.randint(30, size=3000)
start = dt.datetime.now() start = dt.datetime.now()
for i in range(10000): for i in range(10000):
weights = rank_build(x, 20, groups) weights = rank_build(x, 30, groups)
print(dt.datetime.now() - start) print(dt.datetime.now() - start)
#print(x, '\n', weights)
# -*- coding: utf-8 -*-
"""
Created on 2017-4-27
@author: cheng.li
"""
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on 2017-4-27
@author: cheng.li
"""
import unittest
import numpy as np
import pandas as pd
from alphamind.portfolio.rankbuilder import rank_build
class TestRankBuild(unittest.TestCase):
    """Compare ``rank_build`` with reference weights built from NumPy/pandas."""

    def test_rank_build(self):
        """Ungrouped case: entries with descending rank below the cut-off share equal weight."""
        sample_count = 3000
        keep_count = 300
        scores = np.random.randn(sample_count)

        actual = rank_build(scores, keep_count)

        # argsort applied twice yields each entry's rank in descending order.
        descending_rank = np.argsort(np.argsort(-scores))
        expected = np.where(descending_rank < keep_count, 1. / keep_count, 0.)

        np.testing.assert_array_almost_equal(actual, expected)

    def test_rank_build_with_group(self):
        """Grouped case: top ranks are taken inside each group, weight is split over all picks."""
        sample_count = 3000
        keep_per_group = 10
        group_count = 30
        scores = np.random.randn(sample_count)
        groups = np.random.randint(group_count, size=sample_count)

        actual = rank_build(scores, keep_per_group, groups)

        # pandas ranks start at 1, hence the inclusive comparison.
        within_group_rank = pd.Series(-scores).groupby(groups).rank()
        selected = (within_group_rank <= keep_per_group).values
        expected = np.zeros(len(scores))
        expected[selected] = 1. / selected.sum()

        np.testing.assert_array_almost_equal(actual, expected)
if __name__ == '__main__':
    # Run the test cases in this module when executed as a script.
    unittest.main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment