Commit c7150a2c authored by Dr.李

update impl

parent 77b3d469
...@@ -20,7 +20,11 @@ if __name__ == '__main__': ...@@ -20,7 +20,11 @@ if __name__ == '__main__':
benchmark_standardize_with_group(3000, 10, 1000, 30) benchmark_standardize_with_group(3000, 10, 1000, 30)
benchmark_standardize(30, 10, 50000) benchmark_standardize(30, 10, 50000)
benchmark_standardize_with_group(30, 10, 5000, 5) benchmark_standardize_with_group(30, 10, 5000, 5)
benchmark_standardize(50000, 50, 20)
benchmark_standardize_with_group(50000, 50, 20, 50)
benchmark_winsorize_normal(3000, 10, 1000) benchmark_winsorize_normal(3000, 10, 1000)
benchmark_winsorize_normal_with_group(3000, 10, 1000, 30) benchmark_winsorize_normal_with_group(3000, 10, 1000, 30)
benchmark_winsorize_normal(30, 10, 50000) benchmark_winsorize_normal(30, 10, 50000)
benchmark_winsorize_normal_with_group(30, 10, 5000, 5) benchmark_winsorize_normal_with_group(30, 10, 5000, 5)
benchmark_winsorize_normal(50000, 50, 20)
benchmark_winsorize_normal_with_group(50000, 50, 20, 50)
...@@ -13,9 +13,9 @@ from libc.math cimport sqrt ...@@ -13,9 +13,9 @@ from libc.math cimport sqrt
@cython.boundscheck(False) @cython.boundscheck(False)
@cython.wraparound(False) @cython.wraparound(False)
cdef int max_groups(long[:] groups, long length) nogil: cdef int max_groups(long[:] groups, size_t length) nogil:
cdef long curr_max = 0 cdef long curr_max = 0
cdef long i cdef size_t i
cdef long curr cdef long curr
for i in range(length): for i in range(length):
...@@ -27,54 +27,76 @@ cdef int max_groups(long[:] groups, long length) nogil: ...@@ -27,54 +27,76 @@ cdef int max_groups(long[:] groups, long length) nogil:
@cython.boundscheck(False) @cython.boundscheck(False)
@cython.wraparound(False) @cython.wraparound(False)
@cython.cdivision(True) @cython.cdivision(True)
cpdef np.ndarray[double, ndim=2] agg_mean(long[:] groups, double[:, :] x): cdef double[:, :] agg_mean(long[:] groups, double[:, :] x, size_t length, size_t width):
cdef long length = groups.shape[0]
cdef long width = x.shape[1]
cdef long max_g = max_groups(groups, length) cdef long max_g = max_groups(groups, length)
cdef double[:, :] res = np.zeros((max_g+1, width)) cdef double[:, :] res = np.zeros((max_g+1, width))
cdef long[:] bin_count = np.zeros(max_g+1, dtype=int) cdef long[:] bin_count = np.zeros(max_g+1, dtype=int)
cdef long i cdef size_t i
cdef long j cdef size_t j
cdef long curr cdef long curr
for i in range(length): with nogil:
for j in range(width): for i in range(length):
res[groups[i], j] += x[i, j] for j in range(width):
bin_count[groups[i]] += 1 res[groups[i], j] += x[i, j]
bin_count[groups[i]] += 1
for i in range(res.shape[0]):
curr = bin_count[i]
if curr != 0:
for j in range(width):
res[i, j] /= curr
return res
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef np.ndarray[double, ndim=2] transform(long[:] groups, double[:, :] x, str func):
cdef size_t length = x.shape[0]
cdef size_t width = x.shape[1]
cdef double[:, :] res_data = np.zeros((length, width))
cdef double[:, :] value_data = np.zeros((length, width))
cdef size_t i
cdef size_t j
if func == 'mean':
value_data = agg_mean(groups, x, length, width)
elif func == 'std':
value_data = agg_std(groups, x, length, width, ddof=1)
for i in range(res.shape[0]): with nogil:
curr = bin_count[i] for i in range(length):
if curr != 0:
for j in range(width): for j in range(width):
res[i, j] /= curr res_data[i, j] = value_data[groups[i], j]
return np.asarray(res)
return np.asarray(res_data)
@cython.boundscheck(False) @cython.boundscheck(False)
@cython.wraparound(False) @cython.wraparound(False)
@cython.cdivision(True) @cython.cdivision(True)
cpdef np.ndarray[double, ndim=2] agg_std(long[:] groups, double[:, :] x, long ddof=1): cdef double[:, :] agg_std(long[:] groups, double[:, :] x, size_t length, size_t width, long ddof=1):
cdef long length = groups.shape[0]
cdef long width = x.shape[1]
cdef long max_g = max_groups(groups, length) cdef long max_g = max_groups(groups, length)
cdef double[:, :] running_sum_square = np.zeros((max_g+1, width)) cdef double[:, :] running_sum_square = np.zeros((max_g+1, width))
cdef double[:, :] running_sum = np.zeros((max_g+1, width)) cdef double[:, :] running_sum = np.zeros((max_g+1, width))
cdef long[:] bin_count = np.zeros(max_g+1, dtype=int) cdef long[:] bin_count = np.zeros(max_g+1, dtype=int)
cdef long i cdef size_t i
cdef long j cdef size_t j
cdef long curr cdef long curr
cdef double raw_value cdef double raw_value
for i in range(length): with nogil:
for j in range(width): for i in range(length):
raw_value = x[i, j]
running_sum[groups[i], j] += raw_value
running_sum_square[groups[i], j] += raw_value * raw_value
bin_count[groups[i]] += 1
for i in range(running_sum_square.shape[0]):
curr = bin_count[i]
if curr > ddof:
for j in range(width): for j in range(width):
running_sum_square[i, j] = sqrt((running_sum_square[i, j] - running_sum[i, j] * running_sum[i, j] / curr) / (curr - ddof)) raw_value = x[i, j]
return np.asarray(running_sum_square) running_sum[groups[i], j] += raw_value
\ No newline at end of file running_sum_square[groups[i], j] += raw_value * raw_value
bin_count[groups[i]] += 1
for i in range(running_sum_square.shape[0]):
curr = bin_count[i]
if curr > ddof:
for j in range(width):
running_sum_square[i, j] = sqrt((running_sum_square[i, j] - running_sum[i, j] * running_sum[i, j] / curr) / (curr - ddof))
return running_sum_square
\ No newline at end of file
...@@ -6,20 +6,14 @@ Created on 2017-4-25 ...@@ -6,20 +6,14 @@ Created on 2017-4-25
""" """
import numpy as np import numpy as np
from alphamind.data.impl import agg_mean from alphamind.data.impl import transform
from alphamind.data.impl import agg_std
def standardize(x: np.ndarray, groups: np.ndarray=None) -> np.ndarray: def standardize(x: np.ndarray, groups: np.ndarray=None) -> np.ndarray:
if groups is not None: if groups is not None:
mean_values = agg_mean(groups, x) mean_values = transform(groups, x, 'mean')
std_values = agg_std(groups, x, ddof=1) std_values = transform(groups, x, 'std')
value_index = np.searchsorted(range(len(mean_values)), groups)
mean_values = mean_values[value_index]
std_values = std_values[value_index]
return (x - mean_values) / std_values return (x - mean_values) / std_values
else: else:
......
...@@ -6,29 +6,20 @@ Created on 2017-4-25 ...@@ -6,29 +6,20 @@ Created on 2017-4-25
""" """
import numpy as np import numpy as np
from alphamind.data.impl import agg_mean from alphamind.data.impl import transform
from alphamind.data.impl import agg_std
def winsorize_normal(x: np.ndarray, num_stds: int=3, groups: np.ndarray=None) -> np.ndarray: def winsorize_normal(x: np.ndarray, num_stds: int=3, groups: np.ndarray=None) -> np.ndarray:
if groups is not None: if groups is not None:
mean_values = agg_mean(groups, x) mean_values = transform(groups, x, 'mean')
std_values = agg_std(groups, x, ddof=1) std_values = transform(groups, x, 'std')
value_index = np.searchsorted(range(len(mean_values)), groups)
ubound = mean_values + num_stds * std_values
lbound = mean_values - num_stds * std_values
ubound = ubound[value_index]
lbound = lbound[value_index]
else: else:
std_values = x.std(axis=0) std_values = x.std(axis=0)
mean_values = x.mean(axis=0) mean_values = x.mean(axis=0)
ubound = mean_values + num_stds * std_values ubound = mean_values + num_stds * std_values
lbound = mean_values - num_stds * std_values lbound = mean_values - num_stds * std_values
res = np.where(x > ubound, ubound, np.where(x < lbound, lbound, x)) res = np.where(x > ubound, ubound, np.where(x < lbound, lbound, x))
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment