Commit e9d233d4 authored by Dr.李

restructure

parent 8328777b
# -*- coding: utf-8 -*-
"""
Created on 2017-5-3
@author: cheng.li
"""
import math
import numpy as np
import numba as nb
def groupby(groups):
    """
    Split element positions into one index array per distinct group label.

    :param groups: 1-d array of non-negative integer labels (it is passed
                   straight to ``np.bincount``)
    :return: list of index arrays, one per label that actually occurs, in
             ascending label order; each array holds the positions in
             ``groups`` carrying that label
    """
    sorted_positions = groups.argsort()
    counts = np.bincount(groups)
    # keep only the labels that occur at least once
    sizes = counts[counts.nonzero()[0]]
    # cumulative sizes give the end offset of every group in sorted order
    stops = np.cumsum(sizes)
    return [sorted_positions[stop - size:stop] for size, stop in zip(sizes, stops)]
@nb.njit(nogil=True, cache=True)
def group_mapping(groups):
    """
    Relabel arbitrary group values as consecutive integers starting at 0.

    The elements are visited in sorted order and the label is bumped each
    time the value changes, so equal values always share one label.

    :param groups: 1-d array of group values
    :return: array of the same length as ``groups`` (dtype of the argsort
             result) holding the dense label of every element; an empty
             array when ``groups`` is empty
    """
    length = groups.shape[0]
    order = groups.argsort()
    res = np.zeros(length, dtype=order.dtype)

    # Guard: with no elements there is no order[0] to read or write below.
    if length == 0:
        return res

    start = 0
    res[order[0]] = start
    previous = groups[order[0]]
    for i in range(1, length):
        curr_idx = order[i]
        curr_val = groups[curr_idx]
        # sorted traversal: a change of value marks the start of the next group
        if curr_val != previous:
            start += 1
        res[curr_idx] = start
        previous = curr_val
    return res
@nb.njit(nogil=True, cache=True)
def simple_sum(x, axis=0):
    """
    Sum a 2-d array along one axis with explicit loops.

    :param x: 2-d array
    :param axis: 0 to get one total per column, 1 to get one total per row
    :return: 1-d float array of totals
    """
    n_rows, n_cols = x.shape

    if axis == 0:
        res = np.zeros(n_cols)
        for row in range(n_rows):
            for col in range(n_cols):
                res[col] += x[row, col]
    elif axis == 1:
        res = np.zeros(n_rows)
        for row in range(n_rows):
            # accumulate the row locally, then store it once
            total = 0.0
            for col in range(n_cols):
                total += x[row, col]
            res[row] = total
    return res
@nb.njit(nogil=True, cache=True)
def simple_mean(x, axis=0):
    """
    Arithmetic mean of a 2-d array along one axis with explicit loops.

    :param x: 2-d array
    :param axis: 0 to get one mean per column, 1 to get one mean per row
    :return: 1-d float array of means
    """
    n_rows, n_cols = x.shape

    if axis == 0:
        res = np.zeros(n_cols)
        for col in range(n_cols):
            total = 0.0
            for row in range(n_rows):
                total += x[row, col]
            res[col] = total / n_rows
    elif axis == 1:
        res = np.zeros(n_rows)
        for row in range(n_rows):
            total = 0.0
            for col in range(n_cols):
                total += x[row, col]
            res[row] = total / n_cols
    return res
@nb.njit(nogil=True, cache=True)
def simple_std(x, axis=0, ddof=1):
    """
    Standard deviation of a 2-d array along one axis.

    Uses the single-pass form
    ``sqrt((sum(v * v) - sum(v) ** 2 / n) / (n - ddof))`` where ``n`` is the
    number of elements reduced over.

    :param x: 2-d array
    :param axis: 0 to get one value per column, 1 to get one value per row
    :param ddof: delta degrees of freedom subtracted from ``n`` in the divisor
    :return: 1-d float array of standard deviations
    """
    length, width = x.shape

    if axis == 0:
        res = np.zeros(width)
        sum_mat = np.zeros(width)
        for j in range(width):
            for i in range(length):
                res[j] += x[i, j] * x[i, j]
                sum_mat[j] += x[i, j]
            res[j] = math.sqrt((res[j] - sum_mat[j] * sum_mat[j] / length) / (length - ddof))
    elif axis == 1:
        res = np.zeros(length)
        # One running sum per row: this buffer is indexed by the row index,
        # so it needs `length` entries (sizing it by `width` goes out of
        # bounds whenever there are more rows than columns).
        sum_mat = np.zeros(length)
        for i in range(length):
            for j in range(width):
                res[i] += x[i, j] * x[i, j]
                sum_mat[i] += x[i, j]
            res[i] = math.sqrt((res[i] - sum_mat[i] * sum_mat[i] / width) / (width - ddof))
    return res
@nb.njit(nogil=True, cache=True)
def agg_sum(groups, x):
    """
    Column-wise sum of the rows of ``x`` within each group.

    :param groups: integer label per row of ``x``; the label is used directly
                   as the row index of the result
    :param x: 2-d array
    :return: float64 array of shape ``(groups.max() + 1, x.shape[1])``
    """
    n_groups = groups.max() + 1
    n_rows, n_cols = x.shape
    totals = np.zeros((n_groups, n_cols), dtype=np.float64)

    for row in range(n_rows):
        g = groups[row]
        for col in range(n_cols):
            totals[g, col] += x[row, col]
    return totals
@nb.njit(nogil=True, cache=True)
def agg_abssum(groups, x):
    """
    Column-wise sum of absolute values of the rows of ``x`` within each group.

    :param groups: integer label per row of ``x``; the label is used directly
                   as the row index of the result
    :param x: 2-d array
    :return: float64 array of shape ``(groups.max() + 1, x.shape[1])``
    """
    n_groups = groups.max() + 1
    n_rows, n_cols = x.shape
    totals = np.zeros((n_groups, n_cols), dtype=np.float64)

    for row in range(n_rows):
        g = groups[row]
        for col in range(n_cols):
            totals[g, col] += abs(x[row, col])
    return totals
@nb.njit(nogil=True, cache=True)
def agg_mean(groups, x):
    """
    Column-wise mean of the rows of ``x`` within each group.

    :param groups: integer label per row of ``x``; the label is used directly
                   as the row index of the result
    :param x: 2-d array
    :return: float64 array of shape ``(groups.max() + 1, x.shape[1])``
    """
    n_groups = groups.max() + 1
    n_rows, n_cols = x.shape
    res = np.zeros((n_groups, n_cols), dtype=np.float64)
    members = np.zeros(n_groups, dtype=np.int32)

    # first pass: per-group column totals and member counts
    for row in range(n_rows):
        g = groups[row]
        for col in range(n_cols):
            res[g, col] += x[row, col]
        members[g] += 1

    # second pass: turn totals into means
    # NOTE(review): a label in 0..max that never occurs has a count of 0, so
    # this divides by zero for it -- confirm callers always pass dense labels.
    for g in range(n_groups):
        n_members = members[g]
        for col in range(n_cols):
            res[g, col] /= n_members
    return res
@nb.njit(nogil=True, cache=True)
def agg_std(groups, x, ddof=1):
    """
    Column-wise standard deviation of the rows of ``x`` within each group.

    Uses the single-pass form
    ``sqrt((sum(v * v) - sum(v) ** 2 / n) / (n - ddof))`` with ``n`` the
    number of rows in the group.

    :param groups: integer label per row of ``x``; the label is used directly
                   as the row index of the result
    :param x: 2-d array
    :param ddof: delta degrees of freedom subtracted from ``n`` in the divisor
    :return: float64 array of shape ``(groups.max() + 1, x.shape[1])``
    """
    n_groups = groups.max() + 1
    n_rows, n_cols = x.shape
    res = np.zeros((n_groups, n_cols), dtype=np.float64)
    squares = np.zeros((n_groups, n_cols), dtype=np.float64)
    members = np.zeros(n_groups, dtype=np.int32)

    # first pass: per-group sums, sums of squares and member counts
    for row in range(n_rows):
        g = groups[row]
        for col in range(n_cols):
            res[g, col] += x[row, col]
            squares[g, col] += x[row, col] * x[row, col]
        members[g] += 1

    # second pass: overwrite each sum with the resulting deviation
    for g in range(n_groups):
        n_members = members[g]
        for col in range(n_cols):
            res[g, col] = math.sqrt((squares[g, col] - res[g, col] * res[g, col] / n_members) / (n_members - ddof))
    return res
@nb.njit(nogil=True, cache=True)
def copy_value(groups, source):
    """
    Expand a per-group table into a per-element table.

    :param groups: 1-d array of integer labels, each used as a row index
                   into ``source``
    :param source: 2-d array with one row per group
    :return: float array of shape ``(len(groups), source.shape[1])`` whose
             row ``i`` equals ``source[groups[i]]``
    """
    n_items = groups.shape[0]
    n_cols = source.shape[1]
    destination = np.zeros((n_items, n_cols))

    for item in range(n_items):
        src_row = groups[item]
        for col in range(n_cols):
            destination[item, col] = source[src_row, col]
    return destination
def transform(groups, x, func, ddof=1):
    """
    Compute a per-group statistic of ``x`` and copy it back onto every row.

    :param groups: integer group label per row of ``x``
    :param x: 2-d array of values
    :param func: one of 'mean', 'std', 'sum' or 'abssum'
    :param ddof: delta degrees of freedom, only used when ``func`` is 'std'
    :return: array with one row per entry of ``groups`` holding the statistic
             of that entry's group
    :raises ValueError: if ``func`` is not one of the recognized names
    """
    if func == 'mean':
        return copy_value(groups, agg_mean(groups, x))
    if func == 'std':
        return copy_value(groups, agg_std(groups, x, ddof=ddof))
    if func == 'sum':
        return copy_value(groups, agg_sum(groups, x))
    if func == 'abssum':
        return copy_value(groups, agg_abssum(groups, x))
    raise ValueError('({0}) is not recognized as valid functor'.format(func))
def aggregate(groups, x, func, ddof=1):
    """
    Compute a per-group statistic of ``x``.

    :param groups: integer group label per row of ``x``
    :param x: 2-d array of values
    :param func: one of 'mean', 'std', 'sum' or 'abssum'
    :param ddof: delta degrees of freedom, only used when ``func`` is 'std'
    :return: the table produced by the matching ``agg_*`` routine, one row
             per group label
    :raises ValueError: if ``func`` is not one of the recognized names
    """
    if func == 'mean':
        return agg_mean(groups, x)
    if func == 'std':
        return agg_std(groups, x, ddof=ddof)
    if func == 'sum':
        return agg_sum(groups, x)
    if func == 'abssum':
        return agg_abssum(groups, x)
    raise ValueError('({0}) is not recognized as valid functor'.format(func))
if __name__ == '__main__':
    # Ad-hoc benchmark: time 1000 grouped-mean aggregations on random data
    # and print the elapsed wall-clock time, two passes in a row.
    import datetime as dt

    n_samples = 6000
    n_features = 10
    n_groups = 30

    groups = np.random.randint(n_groups, size=n_samples)
    max_g = n_groups - 1
    x = np.random.randn(n_samples, n_features)

    # NOTE(review): presumably the first pass absorbs numba's compilation
    # cost and the second shows the warm timing -- confirm.
    for _ in range(2):
        start = dt.datetime.now()
        for i in range(1000):
            res = aggregate(groups, x, 'mean')
        print(dt.datetime.now() - start)
\ No newline at end of file
......@@ -12,7 +12,7 @@ from numpy.linalg import solve
from typing import Tuple
from typing import Union
from typing import Dict
from alphamind.aggregate import groupby
from alphamind.utilities import groupby
def neutralize(x: np.ndarray, y: np.ndarray, groups: np.ndarray=None, output_explained=False, output_exposure=False) \
......
......@@ -6,10 +6,10 @@ Created on 2017-4-25
"""
import numpy as np
from alphamind.aggregate import group_mapping
from alphamind.aggregate import transform
from alphamind.aggregate import simple_mean
from alphamind.aggregate import simple_std
from alphamind.utilities import group_mapping
from alphamind.utilities import transform
from alphamind.utilities import simple_mean
from alphamind.utilities import simple_std
def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray:
......
......@@ -7,10 +7,10 @@ Created on 2017-4-25
import numpy as np
import numba as nb
from alphamind.aggregate import group_mapping
from alphamind.aggregate import transform
from alphamind.aggregate import simple_mean
from alphamind.aggregate import simple_std
from alphamind.utilities import group_mapping
from alphamind.utilities import transform
from alphamind.utilities import simple_mean
from alphamind.utilities import simple_std
@nb.njit(nogil=True, cache=True)
......
......@@ -9,7 +9,7 @@ import numpy as np
import numba as nb
from numpy import zeros
from numpy import zeros_like
from alphamind.aggregate import groupby
from alphamind.utilities import groupby
@nb.njit(nogil=True, cache=True)
......
......@@ -6,9 +6,9 @@ Created on 2017-4-28
"""
import numpy as np
from alphamind.aggregate import group_mapping
from alphamind.aggregate import aggregate
from alphamind.aggregate import simple_sum
from alphamind.utilities import group_mapping
from alphamind.utilities import aggregate
from alphamind.utilities import simple_sum
def simple_settle(weights: np.ndarray, ret_series: np.ndarray, groups: np.ndarray=None) -> np.ndarray:
......
......@@ -9,6 +9,10 @@ import os
import sys
import logging
import unittest
import math
from typing import List
import numpy as np
import numba as nb
# Module-level logger obtained under the name 'ALPHA_MIND'.
alpha_logger = logging.getLogger('ALPHA_MIND')
......@@ -46,3 +50,208 @@ class TestRunner(object):
sys.exit(-1)
else:
sys.exit(0)
def groupby(groups: np.ndarray) -> List[np.ndarray]:
    """
    Split element positions into one index array per distinct group label.

    :param groups: 1-d array of non-negative integer labels (it is passed
                   straight to ``np.bincount``)
    :return: list of index arrays, one per label that actually occurs, in
             ascending label order; each array holds the positions in
             ``groups`` carrying that label
    """
    sorted_positions = groups.argsort()
    counts = np.bincount(groups)
    # keep only the labels that occur at least once
    sizes = counts[counts.nonzero()[0]]
    # cumulative sizes give the end offset of every group in sorted order
    stops = np.cumsum(sizes)
    return [sorted_positions[stop - size:stop] for size, stop in zip(sizes, stops)]
@nb.njit(nogil=True, cache=True)
def group_mapping(groups: np.ndarray) -> np.ndarray:
    """
    Relabel arbitrary group values as consecutive integers starting at 0.

    The elements are visited in sorted order and the label is bumped each
    time the value changes, so equal values always share one label.

    :param groups: 1-d array of group values
    :return: array of the same length as ``groups`` (dtype of the argsort
             result) holding the dense label of every element; an empty
             array when ``groups`` is empty
    """
    length = groups.shape[0]
    order = groups.argsort()
    res = np.zeros(length, dtype=order.dtype)

    # Guard: with no elements there is no order[0] to read or write below.
    if length == 0:
        return res

    start = 0
    res[order[0]] = start
    previous = groups[order[0]]
    for i in range(1, length):
        curr_idx = order[i]
        curr_val = groups[curr_idx]
        # sorted traversal: a change of value marks the start of the next group
        if curr_val != previous:
            start += 1
        res[curr_idx] = start
        previous = curr_val
    return res
@nb.njit(nogil=True, cache=True)
def simple_sum(x, axis=0):
    """
    Sum a 2-d array along one axis with explicit loops.

    :param x: 2-d array
    :param axis: 0 to get one total per column, 1 to get one total per row
    :return: 1-d float array of totals
    """
    n_rows, n_cols = x.shape

    if axis == 0:
        res = np.zeros(n_cols)
        for row in range(n_rows):
            for col in range(n_cols):
                res[col] += x[row, col]
    elif axis == 1:
        res = np.zeros(n_rows)
        for row in range(n_rows):
            # accumulate the row locally, then store it once
            total = 0.0
            for col in range(n_cols):
                total += x[row, col]
            res[row] = total
    return res
@nb.njit(nogil=True, cache=True)
def simple_mean(x, axis=0):
    """
    Arithmetic mean of a 2-d array along one axis with explicit loops.

    :param x: 2-d array
    :param axis: 0 to get one mean per column, 1 to get one mean per row
    :return: 1-d float array of means
    """
    n_rows, n_cols = x.shape

    if axis == 0:
        res = np.zeros(n_cols)
        for col in range(n_cols):
            total = 0.0
            for row in range(n_rows):
                total += x[row, col]
            res[col] = total / n_rows
    elif axis == 1:
        res = np.zeros(n_rows)
        for row in range(n_rows):
            total = 0.0
            for col in range(n_cols):
                total += x[row, col]
            res[row] = total / n_cols
    return res
@nb.njit(nogil=True, cache=True)
def simple_std(x, axis=0, ddof=1):
    """
    Standard deviation of a 2-d array along one axis.

    Uses the single-pass form
    ``sqrt((sum(v * v) - sum(v) ** 2 / n) / (n - ddof))`` where ``n`` is the
    number of elements reduced over.

    :param x: 2-d array
    :param axis: 0 to get one value per column, 1 to get one value per row
    :param ddof: delta degrees of freedom subtracted from ``n`` in the divisor
    :return: 1-d float array of standard deviations
    """
    length, width = x.shape

    if axis == 0:
        res = np.zeros(width)
        sum_mat = np.zeros(width)
        for j in range(width):
            for i in range(length):
                res[j] += x[i, j] * x[i, j]
                sum_mat[j] += x[i, j]
            res[j] = math.sqrt((res[j] - sum_mat[j] * sum_mat[j] / length) / (length - ddof))
    elif axis == 1:
        res = np.zeros(length)
        # One running sum per row: this buffer is indexed by the row index,
        # so it needs `length` entries (sizing it by `width` goes out of
        # bounds whenever there are more rows than columns).
        sum_mat = np.zeros(length)
        for i in range(length):
            for j in range(width):
                res[i] += x[i, j] * x[i, j]
                sum_mat[i] += x[i, j]
            res[i] = math.sqrt((res[i] - sum_mat[i] * sum_mat[i] / width) / (width - ddof))
    return res
@nb.njit(nogil=True, cache=True)
def agg_sum(groups, x):
    """
    Column-wise sum of the rows of ``x`` within each group.

    :param groups: integer label per row of ``x``; the label is used directly
                   as the row index of the result
    :param x: 2-d array
    :return: float64 array of shape ``(groups.max() + 1, x.shape[1])``
    """
    n_groups = groups.max() + 1
    n_rows, n_cols = x.shape
    totals = np.zeros((n_groups, n_cols), dtype=np.float64)

    for row in range(n_rows):
        g = groups[row]
        for col in range(n_cols):
            totals[g, col] += x[row, col]
    return totals
@nb.njit(nogil=True, cache=True)
def agg_abssum(groups, x):
    """
    Column-wise sum of absolute values of the rows of ``x`` within each group.

    :param groups: integer label per row of ``x``; the label is used directly
                   as the row index of the result
    :param x: 2-d array
    :return: float64 array of shape ``(groups.max() + 1, x.shape[1])``
    """
    n_groups = groups.max() + 1
    n_rows, n_cols = x.shape
    totals = np.zeros((n_groups, n_cols), dtype=np.float64)

    for row in range(n_rows):
        g = groups[row]
        for col in range(n_cols):
            totals[g, col] += abs(x[row, col])
    return totals
@nb.njit(nogil=True, cache=True)
def agg_mean(groups, x):
    """
    Column-wise mean of the rows of ``x`` within each group.

    :param groups: integer label per row of ``x``; the label is used directly
                   as the row index of the result
    :param x: 2-d array
    :return: float64 array of shape ``(groups.max() + 1, x.shape[1])``
    """
    n_groups = groups.max() + 1
    n_rows, n_cols = x.shape
    res = np.zeros((n_groups, n_cols), dtype=np.float64)
    members = np.zeros(n_groups, dtype=np.int32)

    # first pass: per-group column totals and member counts
    for row in range(n_rows):
        g = groups[row]
        for col in range(n_cols):
            res[g, col] += x[row, col]
        members[g] += 1

    # second pass: turn totals into means
    # NOTE(review): a label in 0..max that never occurs has a count of 0, so
    # this divides by zero for it -- confirm callers always pass dense labels.
    for g in range(n_groups):
        n_members = members[g]
        for col in range(n_cols):
            res[g, col] /= n_members
    return res
@nb.njit(nogil=True, cache=True)
def agg_std(groups, x, ddof=1):
    """
    Column-wise standard deviation of the rows of ``x`` within each group.

    Uses the single-pass form
    ``sqrt((sum(v * v) - sum(v) ** 2 / n) / (n - ddof))`` with ``n`` the
    number of rows in the group.

    :param groups: integer label per row of ``x``; the label is used directly
                   as the row index of the result
    :param x: 2-d array
    :param ddof: delta degrees of freedom subtracted from ``n`` in the divisor
    :return: float64 array of shape ``(groups.max() + 1, x.shape[1])``
    """
    n_groups = groups.max() + 1
    n_rows, n_cols = x.shape
    res = np.zeros((n_groups, n_cols), dtype=np.float64)
    squares = np.zeros((n_groups, n_cols), dtype=np.float64)
    members = np.zeros(n_groups, dtype=np.int32)

    # first pass: per-group sums, sums of squares and member counts
    for row in range(n_rows):
        g = groups[row]
        for col in range(n_cols):
            res[g, col] += x[row, col]
            squares[g, col] += x[row, col] * x[row, col]
        members[g] += 1

    # second pass: overwrite each sum with the resulting deviation
    for g in range(n_groups):
        n_members = members[g]
        for col in range(n_cols):
            res[g, col] = math.sqrt((squares[g, col] - res[g, col] * res[g, col] / n_members) / (n_members - ddof))
    return res
@nb.njit(nogil=True, cache=True)
def copy_value(groups, source):
    """
    Expand a per-group table into a per-element table.

    :param groups: 1-d array of integer labels, each used as a row index
                   into ``source``
    :param source: 2-d array with one row per group
    :return: float array of shape ``(len(groups), source.shape[1])`` whose
             row ``i`` equals ``source[groups[i]]``
    """
    n_items = groups.shape[0]
    n_cols = source.shape[1]
    destination = np.zeros((n_items, n_cols))

    for item in range(n_items):
        src_row = groups[item]
        for col in range(n_cols):
            destination[item, col] = source[src_row, col]
    return destination
def transform(groups, x, func, ddof=1):
    """
    Compute a per-group statistic of ``x`` and copy it back onto every row.

    :param groups: integer group label per row of ``x``
    :param x: 2-d array of values
    :param func: one of 'mean', 'std', 'sum' or 'abssum'
    :param ddof: delta degrees of freedom, only used when ``func`` is 'std'
    :return: array with one row per entry of ``groups`` holding the statistic
             of that entry's group
    :raises ValueError: if ``func`` is not one of the recognized names
    """
    if func == 'mean':
        return copy_value(groups, agg_mean(groups, x))
    if func == 'std':
        return copy_value(groups, agg_std(groups, x, ddof=ddof))
    if func == 'sum':
        return copy_value(groups, agg_sum(groups, x))
    if func == 'abssum':
        return copy_value(groups, agg_abssum(groups, x))
    raise ValueError('({0}) is not recognized as valid functor'.format(func))
def aggregate(groups, x, func, ddof=1):
    """
    Compute a per-group statistic of ``x``.

    :param groups: integer group label per row of ``x``
    :param x: 2-d array of values
    :param func: one of 'mean', 'std', 'sum' or 'abssum'
    :param ddof: delta degrees of freedom, only used when ``func`` is 'std'
    :return: the table produced by the matching ``agg_*`` routine, one row
             per group label
    :raises ValueError: if ``func`` is not one of the recognized names
    """
    if func == 'mean':
        return agg_mean(groups, x)
    if func == 'std':
        return agg_std(groups, x, ddof=ddof)
    if func == 'sum':
        return agg_sum(groups, x)
    if func == 'abssum':
        return agg_abssum(groups, x)
    raise ValueError('({0}) is not recognized as valid functor'.format(func))
cython >= 0.25.2
numpy >= 1.12.1
numba >= 0.30.0
scikit-learn >= 0.18.1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment