Use Cython to improve performance

77b3d469 · Dr.李 · eb4a9adc · 77b3d469 · 77b3d469 · 77b3d469
Commit 77b3d469 authored Apr 26, 2017 by Dr.李
7 changed files
--- a/alphamind/benchmarks/benchmarks.py
+++ b/alphamind/benchmarks/benchmarks.py
@@ -17,10 +17,10 @@ if __name__ == '__main__':
    benchmark_neutralize(3000, 10, 1000)
    benchmark_neutralize(30, 10, 50000)
    benchmark_standardize(3000, 10, 1000)
-    benchmark_standardize_with_group(3000, 10, 100, 30)
+    benchmark_standardize_with_group(3000, 10, 1000, 30)
    benchmark_standardize(30, 10, 50000)
    benchmark_standardize_with_group(30, 10, 5000, 5)
-    benchmark_winsorize_normal(30, 10, 50000)
+    benchmark_winsorize_normal(3000, 10, 1000)
-    benchmark_winsorize_normal_with_group(30, 10, 5000, 5)
+    benchmark_winsorize_normal_with_group(3000, 10, 1000, 30)
    benchmark_winsorize_normal(30, 10, 50000)
    benchmark_winsorize_normal_with_group(30, 10, 5000, 5)
--- a/alphamind/benchmarks/neutralize.py
+++ b/alphamind/benchmarks/neutralize.py
@@ -14,6 +14,7 @@ from alphamind.data.neutralize import ls_fit
 def benchmark_neutralize(n_samples: int, n_features: int, n_loops: int) -> None:
+    print("-" * 60)
    print("Starting least square fitting benchmarking")
    print("Parameters(n_samples: {0}, n_features: {1}, n_loops: {2})".format(n_samples, n_features, n_loops))

--- a/alphamind/data/impl.pyx
+++ b/alphamind/data/impl.pyx
+# -*- coding: utf-8 -*-
+"""
+Created on 2017-4-26
+@author: cheng.li
+"""
+import numpy as np
+cimport numpy as np
+cimport cython
+from libc.math cimport sqrt
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef long max_groups(long[:] groups, long length) nogil:
+    # Return the largest value among the first `length` entries of `groups`,
+    # or 0 when there are none or when every entry is negative (the running
+    # maximum starts at 0).
+    #
+    # The return type is `long`, matching both the tracked maximum and the
+    # `long` variables callers store it in; returning `int` would narrow the
+    # value on platforms where `long` is wider than `int`.
+    cdef long curr_max = 0
+    cdef long i
+    cdef long curr
+    for i in range(length):
+        curr = groups[i]
+        if curr > curr_max:
+            curr_max = curr
+    return curr_max
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.cdivision(True)
+cpdef np.ndarray[double, ndim=2] agg_mean(long[:] groups, double[:, :] x):
+    """
+    Column-wise mean of the rows of ``x`` for each group id.
+
+    :param groups: non-negative integer group id for each row of ``x``
+    :param x: two-dimensional data with at least one row per entry of ``groups``
+    :return: array with ``max(groups) + 1`` rows and ``x.shape[1]`` columns; row ``g``
+             holds the mean of the rows labelled ``g`` and stays all zeros when no
+             row carries that id
+    :raises ValueError: if ``x`` has fewer rows than ``groups`` has entries, or if a
+                        group id is negative
+    """
+    cdef long length = groups.shape[0]
+    cdef long width = x.shape[1]
+    cdef long max_g
+    cdef double[:, :] res
+    cdef long[:] bin_count
+    cdef long i
+    cdef long j
+    cdef long g
+    cdef long curr
+
+    # Bounds checking is switched off for this function, so bad input must be
+    # rejected explicitly instead of indexing outside the buffers.
+    if x.shape[0] < length:
+        raise ValueError("x must have at least as many rows as groups has entries")
+
+    max_g = max_groups(groups, length)
+    res = np.zeros((max_g+1, width))
+    bin_count = np.zeros(max_g+1, dtype=int)
+
+    # Accumulate per-group column sums and row counts.
+    for i in range(length):
+        g = groups[i]
+        if g < 0:
+            raise ValueError("group ids must be non-negative")
+        for j in range(width):
+            res[g, j] += x[i, j]
+        bin_count[g] += 1
+
+    # Turn sums into means; bins that received no rows keep their zeros.
+    for i in range(res.shape[0]):
+        curr = bin_count[i]
+        if curr != 0:
+            for j in range(width):
+                res[i, j] /= curr
+    return np.asarray(res)
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.cdivision(True)
+cpdef np.ndarray[double, ndim=2] agg_std(long[:] groups, double[:, :] x, long ddof=1):
+    """
+    Column-wise standard deviation of the rows of ``x`` for each group id.
+
+    Uses a single pass over the data: per-group sums and sums of squares are
+    accumulated, then combined as ``sqrt((sum_sq - sum**2 / n) / (n - ddof))``.
+
+    :param groups: non-negative integer group id for each row of ``x``
+    :param x: two-dimensional data with at least one row per entry of ``groups``
+    :param ddof: delta degrees of freedom; the divisor is ``n - ddof``
+    :return: array with ``max(groups) + 1`` rows and ``x.shape[1]`` columns. Row ``g``
+             is the standard deviation of the rows labelled ``g``; it is NaN when
+             the group is non-empty but has no more than ``ddof`` rows, and 0 when
+             no row carries that id
+    :raises ValueError: if ``x`` has fewer rows than ``groups`` has entries, or if a
+                        group id is negative
+    """
+    cdef long length = groups.shape[0]
+    cdef long width = x.shape[1]
+    cdef long max_g
+    cdef double[:, :] running_sum_square
+    cdef double[:, :] running_sum
+    cdef long[:] bin_count
+    cdef double nan_value = np.nan
+    cdef long i
+    cdef long j
+    cdef long g
+    cdef long curr
+    cdef double raw_value
+    cdef double variance
+
+    # Bounds checking is switched off for this function, so bad input must be
+    # rejected explicitly instead of indexing outside the buffers.
+    if x.shape[0] < length:
+        raise ValueError("x must have at least as many rows as groups has entries")
+
+    max_g = max_groups(groups, length)
+    running_sum_square = np.zeros((max_g+1, width))
+    running_sum = np.zeros((max_g+1, width))
+    bin_count = np.zeros(max_g+1, dtype=int)
+
+    # Accumulate per-group sums, sums of squares and row counts.
+    for i in range(length):
+        g = groups[i]
+        if g < 0:
+            raise ValueError("group ids must be non-negative")
+        for j in range(width):
+            raw_value = x[i, j]
+            running_sum[g, j] += raw_value
+            running_sum_square[g, j] += raw_value * raw_value
+        bin_count[g] += 1
+
+    for i in range(running_sum_square.shape[0]):
+        curr = bin_count[i]
+        if curr > ddof:
+            for j in range(width):
+                variance = (running_sum_square[i, j] - running_sum[i, j] * running_sum[i, j] / curr) / (curr - ddof)
+                # Cancellation in the one-pass formula can leave a tiny negative
+                # value for (near-)constant groups; clamp so sqrt does not yield NaN.
+                if variance < 0.:
+                    variance = 0.
+                running_sum_square[i, j] = sqrt(variance)
+        elif curr > 0:
+            # Too few rows for the requested ddof: the statistic is undefined, so
+            # report NaN rather than leaving the raw sum of squares in the output.
+            for j in range(width):
+                running_sum_square[i, j] = nan_value
+    return np.asarray(running_sum_square)
\ No newline at end of file
--- a/alphamind/data/standardize.py
+++ b/alphamind/data/standardize.py
@@ -6,14 +6,15 @@ Created on 2017-4-25
 """
 import numpy as np
-import numpy_groupies as npg
+from alphamind.data.impl import agg_mean
+from alphamind.data.impl import agg_std
 def standardize(x: np.ndarray, groups: np.ndarray=None) -> np.ndarray:
    if groups is not None:
-        mean_values = npg.aggregate_nb(groups, x, axis=0, func='mean')
+        mean_values = agg_mean(groups, x)
-        std_values = npg.aggregate_nb(groups, x, axis=0, func='std', ddof=1)
+        std_values = agg_std(groups, x, ddof=1)
        value_index = np.searchsorted(range(len(mean_values)), groups)

--- a/alphamind/data/winsorize.py
+++ b/alphamind/data/winsorize.py
@@ -6,14 +6,15 @@ Created on 2017-4-25
 """
 import numpy as np
-import numpy_groupies as npg
+from alphamind.data.impl import agg_mean
+from alphamind.data.impl import agg_std
 def winsorize_normal(x: np.ndarray, num_stds: int=3, groups: np.ndarray=None) -> np.ndarray:
    if groups is not None:
-        mean_values = npg.aggregate_nb(groups, x, axis=0, func='mean')
+        mean_values = agg_mean(groups, x)
-        std_values = npg.aggregate_nb(groups, x, axis=0, func='std', ddof=1)
+        std_values = agg_std(groups, x, ddof=1)
        value_index = np.searchsorted(range(len(mean_values)), groups)
@@ -36,7 +37,7 @@ def winsorize_normal(x: np.ndarray, num_stds: int=3, groups: np.ndarray=None) ->
 if __name__ == '__main__':
    x = np.random.randn(3000, 10)
-    groups = np.random.randint(20, 40, size=3000)
+    groups = np.random.randint(0, 20, size=3000)
    for _ in range(1000):
        winsorize_normal(x, 2, groups)
\ No newline at end of file
--- a/requirements.txt
+++ b/requirements.txt
-numba >= 0.32.0
+cython >= 0.25.2
 numpy >= 1.12.1
-numpy_groupies >= 0.9.6
 scikit-learn >= 0.18.1
 scipy >= 0.19.0
 pandas >= 0.19.2
\ No newline at end of file
--- a/setup.py
+++ b/setup.py
@@ -5,16 +5,65 @@ Created on 2017-4-25
 @author: cheng.li
 """
+import platform
+import sys
 from setuptools import setup
 from setuptools import find_packages
+from distutils.extension import Extension
+import numpy as np
+from Cython.Build import cythonize
+VERSION = "0.1.0"
+# "--line_trace" is a custom command-line switch handled here. It is removed
+# from sys.argv once read (presumably so that setuptools never sees an option
+# it does not know -- confirm).
+if "--line_trace" in sys.argv:
+    line_trace = True
+    print("Build with line trace enabled ...")
+    sys.argv.remove("--line_trace")
+else:
+    line_trace = False
+# Cython sources, as '/'-separated paths, that are turned into extensions below.
+ext_modules = ['alphamind/data/impl.pyx']
+def generate_extensions(ext_modules, line_trace=False):
+    """
+    Build one ``Extension`` per Cython source path.
+
+    :param ext_modules: iterable of '/'-separated ``.pyx`` source paths
+    :param line_trace: when true, the CYTHON_TRACE and CYTHON_TRACE_NOGIL macros
+                       are defined on every extension
+    :return: list of ``Extension`` objects in the order of ``ext_modules``
+    """
+    define_macros = []
+    if line_trace:
+        print("define cython trace to True ...")
+        define_macros = [('CYTHON_TRACE', 1), ('CYTHON_TRACE_NOGIL', 1)]
+
+    # The dotted module name is the source path with '/' replaced by '.' and
+    # the trailing four characters (the ".pyx" suffix) dropped.
+    return [Extension(name=pyxfile.replace('/', '.')[:-4],
+                      sources=[pyxfile],
+                      define_macros=define_macros)
+            for pyxfile in ext_modules]
+# Thread count handed to cythonize: one per CPU everywhere except Windows,
+# where 0 is used.
+# NOTE(review): 0 presumably means "no parallel build" -- confirm against the
+# Cython documentation for cythonize(nthreads=...).
+if platform.system() != "Windows":
+    import multiprocessing
+    n_cpu = multiprocessing.cpu_count()
+else:
+    n_cpu = 0
+# Cythonize the extensions; 'linetrace' follows the --line_trace switch and
+# 'embedsignature' is always enabled.
+ext_modules_settings = cythonize(generate_extensions(ext_modules, line_trace),
+                                 compiler_directives={'embedsignature': True, 'linetrace': line_trace},
+                                 nthreads=n_cpu)
 setup(
    name='Alpha-Mind',
-    version='',
+    version=VERSION,
    packages=find_packages(),
    url='',
    license='',
    author='wegamekinglc',
    author_email='',
+    ext_modules=ext_modules_settings,
+    include_dirs=[np.get_include()],
    description=''
 )