Commit 77b3d469 authored by Dr.李

using cython to enhance the performance

parent eb4a9adc
...@@ -17,10 +17,10 @@ if __name__ == '__main__': ...@@ -17,10 +17,10 @@ if __name__ == '__main__':
benchmark_neutralize(3000, 10, 1000) benchmark_neutralize(3000, 10, 1000)
benchmark_neutralize(30, 10, 50000) benchmark_neutralize(30, 10, 50000)
benchmark_standardize(3000, 10, 1000) benchmark_standardize(3000, 10, 1000)
benchmark_standardize_with_group(3000, 10, 100, 30) benchmark_standardize_with_group(3000, 10, 1000, 30)
benchmark_standardize(30, 10, 50000) benchmark_standardize(30, 10, 50000)
benchmark_standardize_with_group(30, 10, 5000, 5) benchmark_standardize_with_group(30, 10, 5000, 5)
benchmark_winsorize_normal(30, 10, 50000) benchmark_winsorize_normal(3000, 10, 1000)
benchmark_winsorize_normal_with_group(30, 10, 5000, 5) benchmark_winsorize_normal_with_group(3000, 10, 1000, 30)
benchmark_winsorize_normal(30, 10, 50000) benchmark_winsorize_normal(30, 10, 50000)
benchmark_winsorize_normal_with_group(30, 10, 5000, 5) benchmark_winsorize_normal_with_group(30, 10, 5000, 5)
...@@ -14,6 +14,7 @@ from alphamind.data.neutralize import ls_fit ...@@ -14,6 +14,7 @@ from alphamind.data.neutralize import ls_fit
def benchmark_neutralize(n_samples: int, n_features: int, n_loops: int) -> None: def benchmark_neutralize(n_samples: int, n_features: int, n_loops: int) -> None:
print("-" * 60)
print("Starting least square fitting benchmarking") print("Starting least square fitting benchmarking")
print("Parameters(n_samples: {0}, n_features: {1}, n_loops: {2})".format(n_samples, n_features, n_loops)) print("Parameters(n_samples: {0}, n_features: {1}, n_loops: {2})".format(n_samples, n_features, n_loops))
......
# -*- coding: utf-8 -*-
"""
Created on 2017-4-26
@author: cheng.li
"""
import numpy as np
cimport numpy as np
cimport cython
from libc.math cimport sqrt
# Return the largest label found in the first ``length`` entries of
# ``groups``. The running maximum starts at 0, so the result is never
# negative and an empty input (``length == 0``) yields 0.
#
# The return type is ``long`` (the element type of ``groups``) rather than
# ``int`` so that a large label is not narrowed on platforms where ``long``
# is wider than ``int``.
@cython.boundscheck(False)
@cython.wraparound(False)
cdef long max_groups(long[:] groups, long length) nogil:
    cdef long curr_max = 0
    cdef long i
    cdef long curr

    for i in range(length):
        curr = groups[i]
        if curr > curr_max:
            curr_max = curr
    return curr_max
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cpdef np.ndarray[double, ndim=2] agg_mean(long[:] groups, double[:, :] x):
    """Column-wise mean of the rows of ``x`` for every group label.

    Parameters
    ----------
    groups : long[:]
        Non-negative integer label for each row of ``x``.
    x : double[:, :]
        Values to aggregate; ``x.shape[0]`` must equal ``groups.shape[0]``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max(groups) + 1, x.shape[1])``. Row ``g`` holds the
        mean of the rows labelled ``g``; labels that never occur keep zeros.

    Raises
    ------
    ValueError
        If ``groups`` and ``x`` disagree on the number of rows, or if a label
        is negative.
    """
    cdef long length = groups.shape[0]
    cdef long width = x.shape[1]
    cdef long max_g
    cdef double[:, :] res
    cdef long[:] bin_count
    cdef long i
    cdef long j
    cdef long curr

    # Bounds checking is disabled for this function, so a shape mismatch
    # would otherwise read past the end of ``x``.
    if x.shape[0] != length:
        raise ValueError("groups and x must have the same number of rows")

    max_g = max_groups(groups, length)
    res = np.zeros((max_g + 1, width))
    # 'l' is the C ``long`` dtype, matching the ``long[:]`` memoryview on
    # every platform (``dtype=int`` is not guaranteed to be C long).
    bin_count = np.zeros(max_g + 1, dtype=np.dtype('l'))

    for i in range(length):
        curr = groups[i]
        # Wraparound is disabled too: a negative label would index outside
        # the result buffers instead of raising.
        if curr < 0:
            raise ValueError("group labels must be non-negative")
        for j in range(width):
            res[curr, j] += x[i, j]
        bin_count[curr] += 1

    # Turn the per-group sums into means; empty groups are left at zero.
    for i in range(max_g + 1):
        curr = bin_count[i]
        if curr != 0:
            for j in range(width):
                res[i, j] /= curr
    return np.asarray(res)
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cpdef np.ndarray[double, ndim=2] agg_std(long[:] groups, double[:, :] x, long ddof=1):
    """Column-wise standard deviation of the rows of ``x`` for every group label.

    The deviation is computed from running sums as
    ``sqrt((sum(v * v) - sum(v) ** 2 / n) / (n - ddof))``.

    Parameters
    ----------
    groups : long[:]
        Non-negative integer label for each row of ``x``.
    x : double[:, :]
        Values to aggregate; ``x.shape[0]`` must equal ``groups.shape[0]``.
    ddof : long, optional
        Delta degrees of freedom subtracted from the group size in the
        divisor (default 1).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max(groups) + 1, x.shape[1])``. Row ``g`` holds the
        standard deviation of the rows labelled ``g``. Groups with no rows,
        or with no more than ``ddof`` rows, have no defined deviation and
        are reported as NaN.

    Raises
    ------
    ValueError
        If ``groups`` and ``x`` disagree on the number of rows, or if a label
        is negative.
    """
    cdef long length = groups.shape[0]
    cdef long width = x.shape[1]
    cdef long max_g
    cdef double[:, :] running_sum_square
    cdef double[:, :] running_sum
    cdef long[:] bin_count
    cdef long i
    cdef long j
    cdef long curr
    cdef double raw_value
    cdef double variance
    cdef double nan_value = np.nan

    # Bounds checking is disabled for this function, so a shape mismatch
    # would otherwise read past the end of ``x``.
    if x.shape[0] != length:
        raise ValueError("groups and x must have the same number of rows")

    max_g = max_groups(groups, length)
    running_sum_square = np.zeros((max_g + 1, width))
    running_sum = np.zeros((max_g + 1, width))
    # 'l' is the C ``long`` dtype, matching the ``long[:]`` memoryview on
    # every platform (``dtype=int`` is not guaranteed to be C long).
    bin_count = np.zeros(max_g + 1, dtype=np.dtype('l'))

    for i in range(length):
        curr = groups[i]
        # Wraparound is disabled too: a negative label would index outside
        # the accumulator buffers instead of raising.
        if curr < 0:
            raise ValueError("group labels must be non-negative")
        for j in range(width):
            raw_value = x[i, j]
            running_sum[curr, j] += raw_value
            running_sum_square[curr, j] += raw_value * raw_value
        bin_count[curr] += 1

    # ``running_sum_square`` is reused as the output buffer.
    for i in range(max_g + 1):
        curr = bin_count[i]
        if curr > 0 and curr > ddof:
            for j in range(width):
                variance = (running_sum_square[i, j]
                            - running_sum[i, j] * running_sum[i, j] / curr) / (curr - ddof)
                # Cancellation in the sum-of-squares formula can leave a tiny
                # negative value; clamp it so sqrt does not produce NaN.
                if variance < 0.0:
                    variance = 0.0
                running_sum_square[i, j] = sqrt(variance)
        else:
            # Without this the raw sum of squares would be returned as if it
            # were a standard deviation.
            for j in range(width):
                running_sum_square[i, j] = nan_value
    return np.asarray(running_sum_square)
\ No newline at end of file
...@@ -6,14 +6,15 @@ Created on 2017-4-25 ...@@ -6,14 +6,15 @@ Created on 2017-4-25
""" """
import numpy as np import numpy as np
import numpy_groupies as npg from alphamind.data.impl import agg_mean
from alphamind.data.impl import agg_std
def standardize(x: np.ndarray, groups: np.ndarray=None) -> np.ndarray: def standardize(x: np.ndarray, groups: np.ndarray=None) -> np.ndarray:
if groups is not None: if groups is not None:
mean_values = npg.aggregate_nb(groups, x, axis=0, func='mean') mean_values = agg_mean(groups, x)
std_values = npg.aggregate_nb(groups, x, axis=0, func='std', ddof=1) std_values = agg_std(groups, x, ddof=1)
value_index = np.searchsorted(range(len(mean_values)), groups) value_index = np.searchsorted(range(len(mean_values)), groups)
......
...@@ -6,14 +6,15 @@ Created on 2017-4-25 ...@@ -6,14 +6,15 @@ Created on 2017-4-25
""" """
import numpy as np import numpy as np
import numpy_groupies as npg from alphamind.data.impl import agg_mean
from alphamind.data.impl import agg_std
def winsorize_normal(x: np.ndarray, num_stds: int=3, groups: np.ndarray=None) -> np.ndarray: def winsorize_normal(x: np.ndarray, num_stds: int=3, groups: np.ndarray=None) -> np.ndarray:
if groups is not None: if groups is not None:
mean_values = npg.aggregate_nb(groups, x, axis=0, func='mean') mean_values = agg_mean(groups, x)
std_values = npg.aggregate_nb(groups, x, axis=0, func='std', ddof=1) std_values = agg_std(groups, x, ddof=1)
value_index = np.searchsorted(range(len(mean_values)), groups) value_index = np.searchsorted(range(len(mean_values)), groups)
...@@ -36,7 +37,7 @@ def winsorize_normal(x: np.ndarray, num_stds: int=3, groups: np.ndarray=None) -> ...@@ -36,7 +37,7 @@ def winsorize_normal(x: np.ndarray, num_stds: int=3, groups: np.ndarray=None) ->
if __name__ == '__main__': if __name__ == '__main__':
x = np.random.randn(3000, 10) x = np.random.randn(3000, 10)
groups = np.random.randint(20, 40, size=3000) groups = np.random.randint(0, 20, size=3000)
for _ in range(1000): for _ in range(1000):
winsorize_normal(x, 2, groups) winsorize_normal(x, 2, groups)
\ No newline at end of file
numba >= 0.32.0 cython >= 0.25.2
numpy >= 1.12.1 numpy >= 1.12.1
numpy_groupies >= 0.9.6
scikit-learn >= 0.18.1 scikit-learn >= 0.18.1
scipy >= 0.19.0 scipy >= 0.19.0
pandas >= 0.19.2 pandas >= 0.19.2
\ No newline at end of file
...@@ -5,16 +5,65 @@ Created on 2017-4-25 ...@@ -5,16 +5,65 @@ Created on 2017-4-25
@author: cheng.li @author: cheng.li
""" """
import platform
import sys
from setuptools import setup from setuptools import setup
from setuptools import find_packages from setuptools import find_packages
from distutils.extension import Extension
import numpy as np
from Cython.Build import cythonize
VERSION = "0.1.0"

# "--line_trace" is a custom switch: record whether it was given and strip it
# from sys.argv so that later argument handling does not see it.
line_trace = "--line_trace" in sys.argv
if line_trace:
    print("Build with line trace enabled ...")
    sys.argv.remove("--line_trace")

# Cython sources to compile into extension modules.
ext_modules = ['alphamind/data/impl.pyx']
def generate_extensions(ext_modules, line_trace=False):
    """Build one ``Extension`` object per source path.

    The dotted module name is derived from the path by dropping the file
    suffix (whatever its length) and turning path separators into dots,
    e.g. ``'alphamind/data/impl.pyx'`` -> ``'alphamind.data.impl'``.

    Args:
        ext_modules: iterable of relative source paths, using ``/`` or ``\\``
            as the separator.
        line_trace: when true, the ``CYTHON_TRACE`` and ``CYTHON_TRACE_NOGIL``
            macros are defined (as 1) for every extension.

    Returns:
        A list of ``Extension`` objects, in the order of ``ext_modules``.
    """
    # Local import keeps this function self-contained; the surrounding
    # script does not import ``os`` itself.
    import os

    if line_trace:
        print("define cython trace to True ...")
        define_macros = [('CYTHON_TRACE', 1), ('CYTHON_TRACE_NOGIL', 1)]
    else:
        define_macros = []

    extensions = []
    for pyxfile in ext_modules:
        # splitext removes the real suffix instead of a fixed four characters.
        stem = os.path.splitext(pyxfile)[0]
        module_name = stem.replace('\\', '/').replace('/', '.')
        ext = Extension(name=module_name,
                        sources=[pyxfile],
                        define_macros=define_macros)
        extensions.append(ext)
    return extensions
# Worker count handed to cythonize below: 0 on Windows, otherwise the number
# of CPUs reported by multiprocessing.
if platform.system() == "Windows":
    n_cpu = 0
else:
    import multiprocessing
    n_cpu = multiprocessing.cpu_count()
# Run cythonize over the generated extensions. The 'embedsignature' directive
# is always enabled, 'linetrace' follows the --line_trace flag parsed above,
# and nthreads is n_cpu (0 on Windows, the CPU count elsewhere).
ext_modules_settings = cythonize(generate_extensions(ext_modules, line_trace),
                                 compiler_directives={'embedsignature': True, 'linetrace': line_trace},
                                 nthreads=n_cpu)
setup( setup(
name='Alpha-Mind', name='Alpha-Mind',
version='', version=VERSION,
packages=find_packages(), packages=find_packages(),
url='', url='',
license='', license='',
author='wegamekinglc', author='wegamekinglc',
author_email='', author_email='',
ext_modules=ext_modules_settings,
include_dirs=[np.get_include()],
description='' description=''
) )
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment