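Update impl: add a Cython `transform` that broadcasts per-group mean/std back to each row, use it in `standardize` and `winsorize_normal`, and add larger benchmark cases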

c7150a2c · Dr.李 · 77b3d469 · c7150a2c · c7150a2c · c7150a2c
Commit c7150a2c authored Apr 26, 2017 by Dr.李
Showing with 67 additions and 56 deletions

benchmarks.py alphamind/benchmarks/benchmarks.py +4 -0

impl.pyx alphamind/data/impl.pyx +55 -33

standardize.py alphamind/data/standardize.py +3 -9

winsorize.py alphamind/data/winsorize.py +5 -14

No files found.
--- a/alphamind/benchmarks/benchmarks.py
+++ b/alphamind/benchmarks/benchmarks.py
@@ -20,7 +20,11 @@ if __name__ == '__main__':
    benchmark_standardize_with_group(3000, 10, 1000, 30)
    benchmark_standardize(30, 10, 50000)
    benchmark_standardize_with_group(30, 10, 5000, 5)
+    benchmark_standardize(50000, 50, 20)
+    benchmark_standardize_with_group(50000, 50, 20, 50)
    benchmark_winsorize_normal(3000, 10, 1000)
    benchmark_winsorize_normal_with_group(3000, 10, 1000, 30)
    benchmark_winsorize_normal(30, 10, 50000)
    benchmark_winsorize_normal_with_group(30, 10, 5000, 5)
+    benchmark_winsorize_normal(50000, 50, 20)
+    benchmark_winsorize_normal_with_group(50000, 50, 20, 50)
--- a/alphamind/data/impl.pyx
+++ b/alphamind/data/impl.pyx
@@ -13,9 +13,9 @@ from libc.math cimport sqrt

 @cython.boundscheck(False)
 @cython.wraparound(False)
-cdef int max_groups(long[:] groups, long length) nogil:
+cdef int max_groups(long[:] groups, size_t length) nogil:
    cdef long curr_max = 0
-    cdef long i
+    cdef size_t i
    cdef long curr

    for i in range(length):
@@ -27,54 +27,76 @@ cdef int max_groups(long[:] groups, long length) nogil:
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.cdivision(True)
-cpdef np.ndarray[double, ndim=2] agg_mean(long[:] groups, double[:, :] x):
-    cdef long length = groups.shape[0]
-    cdef long width = x.shape[1]
+cdef double[:, :] agg_mean(long[:] groups, double[:, :] x, size_t length, size_t width):
    cdef long max_g = max_groups(groups, length)
    cdef double[:, :] res = np.zeros((max_g+1, width))
    cdef long[:] bin_count = np.zeros(max_g+1, dtype=int)
-    cdef long i
-    cdef long j
+    cdef size_t i
+    cdef size_t j
    cdef long curr

-    for i in range(length):
-        for j in range(width):
-            res[groups[i], j] += x[i, j]
-        bin_count[groups[i]] += 1
+    with nogil:
+        for i in range(length):
+            for j in range(width):
+                res[groups[i], j] += x[i, j]
+            bin_count[groups[i]] += 1
+
+        for i in range(res.shape[0]):
+            curr = bin_count[i]
+            if curr != 0:
+                for j in range(width):
+                    res[i, j] /= curr
+    return res
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cpdef np.ndarray[double, ndim=2] transform(long[:] groups, double[:, :] x, str func):
+
+    cdef size_t length = x.shape[0]
+    cdef size_t width = x.shape[1]
+    cdef double[:, :] res_data = np.zeros((length, width))
+    cdef double[:, :] value_data = np.zeros((length, width))
+    cdef size_t i
+    cdef size_t j
+
+    if func == 'mean':
+        value_data = agg_mean(groups, x, length, width)
+    elif func == 'std':
+        value_data = agg_std(groups, x, length, width, ddof=1)

-    for i in range(res.shape[0]):
-        curr = bin_count[i]
-        if curr != 0:
+    with nogil:
+        for i in range(length):
            for j in range(width):
-                res[i, j] /= curr
-    return np.asarray(res)
+                res_data[i, j] = value_data[groups[i], j]
+
+    return np.asarray(res_data)


 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.cdivision(True)
-cpdef np.ndarray[double, ndim=2] agg_std(long[:] groups, double[:, :] x, long ddof=1):
-    cdef long length = groups.shape[0]
-    cdef long width = x.shape[1]
+cdef double[:, :] agg_std(long[:] groups, double[:, :] x, size_t length, size_t width, long ddof=1):
    cdef long max_g = max_groups(groups, length)
    cdef double[:, :] running_sum_square = np.zeros((max_g+1, width))
    cdef double[:, :] running_sum = np.zeros((max_g+1, width))
    cdef long[:] bin_count = np.zeros(max_g+1, dtype=int)
-    cdef long i
-    cdef long j
+    cdef size_t i
+    cdef size_t j
    cdef long curr
    cdef double raw_value

-    for i in range(length):
-        for j in range(width):
-            raw_value = x[i, j]
-            running_sum[groups[i], j] += raw_value
-            running_sum_square[groups[i], j] += raw_value * raw_value
-        bin_count[groups[i]] += 1
-
-    for i in range(running_sum_square.shape[0]):
-        curr = bin_count[i]
-        if curr > ddof:
+    with nogil:
+        for i in range(length):
            for j in range(width):
-                running_sum_square[i, j] = sqrt((running_sum_square[i, j] - running_sum[i, j] * running_sum[i, j] / curr) / (curr - ddof))
-    return np.asarray(running_sum_square)
\ No newline at end of file
+                raw_value = x[i, j]
+                running_sum[groups[i], j] += raw_value
+                running_sum_square[groups[i], j] += raw_value * raw_value
+            bin_count[groups[i]] += 1
+
+        for i in range(running_sum_square.shape[0]):
+            curr = bin_count[i]
+            if curr > ddof:
+                for j in range(width):
+                    running_sum_square[i, j] = sqrt((running_sum_square[i, j] - running_sum[i, j] * running_sum[i, j] / curr) / (curr - ddof))
+    return running_sum_square
\ No newline at end of file
--- a/alphamind/data/standardize.py
+++ b/alphamind/data/standardize.py
@@ -6,20 +6,14 @@ Created on 2017-4-25
 """

 import numpy as np
-from alphamind.data.impl import agg_mean
-from alphamind.data.impl import agg_std
+from alphamind.data.impl import transform


 def standardize(x: np.ndarray, groups: np.ndarray=None) -> np.ndarray:

    if groups is not None:
-        mean_values = agg_mean(groups, x)
-        std_values = agg_std(groups, x, ddof=1)
-
-        value_index = np.searchsorted(range(len(mean_values)), groups)
-
-        mean_values = mean_values[value_index]
-        std_values = std_values[value_index]
+        mean_values = transform(groups, x, 'mean')
+        std_values = transform(groups, x, 'std')

        return (x - mean_values) / std_values
    else:

--- a/alphamind/data/winsorize.py
+++ b/alphamind/data/winsorize.py
@@ -6,29 +6,20 @@ Created on 2017-4-25
 """

 import numpy as np
-from alphamind.data.impl import agg_mean
-from alphamind.data.impl import agg_std
+from alphamind.data.impl import transform


 def winsorize_normal(x: np.ndarray, num_stds: int=3, groups: np.ndarray=None) -> np.ndarray:

    if groups is not None:
-        mean_values = agg_mean(groups, x)
-        std_values = agg_std(groups, x, ddof=1)
-
-        value_index = np.searchsorted(range(len(mean_values)), groups)
-
-        ubound = mean_values + num_stds * std_values
-        lbound = mean_values - num_stds * std_values
-
-        ubound = ubound[value_index]
-        lbound = lbound[value_index]
+        mean_values = transform(groups, x, 'mean')
+        std_values = transform(groups, x, 'std')
    else:
        std_values = x.std(axis=0)
        mean_values = x.mean(axis=0)

-        ubound = mean_values + num_stds * std_values
-        lbound = mean_values - num_stds * std_values
+    ubound = mean_values + num_stds * std_values
+    lbound = mean_values - num_stds * std_values

    res = np.where(x > ubound, ubound, np.where(x < lbound, lbound, x))