Commit e9d233d4 authored by Dr.李

restructure

parent 8328777b
# -*- coding: utf-8 -*-
"""
Created on 2017-5-3
@author: cheng.li
"""
import math
import numpy as np
import numba as nb
def groupby(groups):
    """Split element positions into one index array per distinct label.

    :param groups: 1-d array of non-negative integer labels (as required by
                   ``np.bincount``)
    :return: list of index arrays, one per label that occurs, ordered by
             ascending label; each array holds the positions in ``groups``
             that carry that label
    """
    sorted_positions = groups.argsort()
    sizes = np.bincount(groups)
    # drop labels that never occur so they do not yield empty slices
    sizes = sizes[sizes.nonzero()[0]]
    stops = np.cumsum(sizes)
    return [sorted_positions[stop - size:stop] for size, stop in zip(sizes, stops)]
@nb.njit(nogil=True, cache=True)
def group_mapping(groups):
    """Relabel group values as consecutive integer codes starting at 0.

    Codes are handed out while walking ``groups`` in sorted order, so the
    smallest value gets code 0, the next distinct value code 1, and so on.

    :param groups: 1-d array of group labels
    :return: array of the same length as ``groups`` holding each element's
             code; an empty input yields an empty result
    """
    length = groups.shape[0]
    order = groups.argsort()
    res = np.zeros(length, dtype=order.dtype)

    # nothing to label; also avoids reading order[0] of an empty array
    if length == 0:
        return res

    code = 0
    previous = groups[order[0]]
    res[order[0]] = code
    for i in range(1, length):
        curr_idx = order[i]
        curr_val = groups[curr_idx]
        # values are visited in sorted order, so a change starts the next group
        if curr_val != previous:
            code += 1
            previous = curr_val
        res[curr_idx] = code
    return res
@nb.njit(nogil=True, cache=True)
def simple_sum(x, axis=0):
    """Sum a 2-d array along one axis.

    :param x: 2-d array
    :param axis: 0 sums down each column, 1 sums across each row
    :return: 1-d float array of sums
    :raises ValueError: if ``axis`` is neither 0 nor 1
    """
    length, width = x.shape

    if axis == 0:
        res = np.zeros(width)
        for i in range(length):
            for j in range(width):
                res[j] += x[i, j]
    elif axis == 1:
        res = np.zeros(length)
        for i in range(length):
            for j in range(width):
                res[i] += x[i, j]
    else:
        # previously fell through and returned an unassigned ``res``
        raise ValueError('axis must be 0 or 1')
    return res
@nb.njit(nogil=True, cache=True)
def simple_mean(x, axis=0):
    """Average a 2-d array along one axis.

    :param x: 2-d array
    :param axis: 0 averages down each column, 1 averages across each row
    :return: 1-d float array of means
    :raises ValueError: if ``axis`` is neither 0 nor 1
    """
    length, width = x.shape

    if axis == 0:
        res = np.zeros(width)
        for j in range(width):
            for i in range(length):
                res[j] += x[i, j]
            res[j] /= length
    elif axis == 1:
        res = np.zeros(length)
        for i in range(length):
            for j in range(width):
                res[i] += x[i, j]
            res[i] /= width
    else:
        # previously fell through and returned an unassigned ``res``
        raise ValueError('axis must be 0 or 1')
    return res
@nb.njit(nogil=True, cache=True)
def simple_std(x, axis=0, ddof=1):
    """Standard deviation of a 2-d array along one axis (one-pass formula).

    Uses ``sqrt((sum(x^2) - sum(x)^2 / n) / (n - ddof))``.

    :param x: 2-d array
    :param axis: 0 works down each column, 1 works across each row
    :param ddof: delta degrees of freedom subtracted from the sample count
    :return: 1-d float array of standard deviations
    :raises ValueError: if ``axis`` is neither 0 nor 1
    """
    length, width = x.shape

    if axis == 0:
        res = np.zeros(width)
        sum_mat = np.zeros(width)
        for j in range(width):
            for i in range(length):
                res[j] += x[i, j] * x[i, j]
                sum_mat[j] += x[i, j]
            var = (res[j] - sum_mat[j] * sum_mat[j] / length) / (length - ddof)
            # rounding can push the one-pass variance marginally below zero
            if var < 0.0:
                var = 0.0
            res[j] = math.sqrt(var)
    elif axis == 1:
        res = np.zeros(length)
        # one running sum per row; this buffer used to be sized by ``width``
        # and was overrun whenever length > width
        sum_mat = np.zeros(length)
        for i in range(length):
            for j in range(width):
                res[i] += x[i, j] * x[i, j]
                sum_mat[i] += x[i, j]
            var = (res[i] - sum_mat[i] * sum_mat[i] / width) / (width - ddof)
            if var < 0.0:
                var = 0.0
            res[i] = math.sqrt(var)
    else:
        # previously fell through and returned an unassigned ``res``
        raise ValueError('axis must be 0 or 1')
    return res
@nb.njit(nogil=True, cache=True)
def agg_sum(groups, x):
    """Sum the rows of ``x`` per group.

    :param groups: 1-d integer array, one label per row of ``x``; labels are
                   used directly as row indices of the result (presumably
                   dense codes such as those from ``group_mapping``)
    :param x: 2-d array of values
    :return: float64 array of shape (groups.max() + 1, x.shape[1]) where row
             ``g`` is the column-wise sum over the rows labelled ``g``
    """
    n_bins = groups.max() + 1
    n_rows, n_cols = x.shape
    totals = np.zeros((n_bins, n_cols), dtype=np.float64)

    for row in range(n_rows):
        g = groups[row]
        for col in range(n_cols):
            totals[g, col] += x[row, col]
    return totals
@nb.njit(nogil=True, cache=True)
def agg_abssum(groups, x):
    """Sum the absolute values of the rows of ``x`` per group.

    :param groups: 1-d integer array, one label per row of ``x``; labels are
                   used directly as row indices of the result
    :param x: 2-d array of values
    :return: float64 array of shape (groups.max() + 1, x.shape[1]) where row
             ``g`` is the column-wise sum of ``abs(x)`` over rows labelled ``g``
    """
    n_bins = groups.max() + 1
    n_rows, n_cols = x.shape
    totals = np.zeros((n_bins, n_cols), dtype=np.float64)

    for row in range(n_rows):
        g = groups[row]
        for col in range(n_cols):
            totals[g, col] += abs(x[row, col])
    return totals
@nb.njit(nogil=True, cache=True)
def agg_mean(groups, x):
    """Average the rows of ``x`` per group.

    :param groups: 1-d integer array, one label per row of ``x``; labels are
                   used directly as row indices of the result
    :param x: 2-d array of values
    :return: float64 array of shape (groups.max() + 1, x.shape[1]); row ``g``
             is the column-wise mean of the rows labelled ``g``, or NaN when
             no row carries label ``g``
    """
    max_g = groups.max()
    length, width = x.shape
    res = np.zeros((max_g + 1, width), dtype=np.float64)
    bin_count = np.zeros(max_g + 1, dtype=np.int32)

    for i in range(length):
        g = groups[i]
        for j in range(width):
            res[g, j] += x[i, j]
        bin_count[g] += 1

    for g in range(max_g + 1):
        curr = bin_count[g]
        if curr == 0:
            # label never occurs: the mean is undefined, and dividing by the
            # zero count would raise ZeroDivisionError
            for j in range(width):
                res[g, j] = np.nan
        else:
            for j in range(width):
                res[g, j] /= curr
    return res
@nb.njit(nogil=True, cache=True)
def agg_std(groups, x, ddof=1):
    """Standard deviation of the rows of ``x`` per group (one-pass formula).

    Uses ``sqrt((sum(x^2) - sum(x)^2 / n) / (n - ddof))`` within each group.

    :param groups: 1-d integer array, one label per row of ``x``; labels are
                   used directly as row indices of the result
    :param x: 2-d array of values
    :param ddof: delta degrees of freedom subtracted from each group's count
    :return: float64 array of shape (groups.max() + 1, x.shape[1]); row ``g``
             is NaN when label ``g`` has no rows or has ``count <= ddof``
    """
    max_g = groups.max()
    length, width = x.shape
    res = np.zeros((max_g + 1, width), dtype=np.float64)
    sumsq = np.zeros((max_g + 1, width), dtype=np.float64)
    bin_count = np.zeros(max_g + 1, dtype=np.int32)

    for i in range(length):
        g = groups[i]
        for j in range(width):
            res[g, j] += x[i, j]
            sumsq[g, j] += x[i, j] * x[i, j]
        bin_count[g] += 1

    for g in range(max_g + 1):
        curr = bin_count[g]
        dof = curr - ddof
        if curr == 0 or dof <= 0:
            # too few observations: dividing would raise ZeroDivisionError
            # (or yield a meaningless value), so report "undefined" instead
            for j in range(width):
                res[g, j] = np.nan
        else:
            for j in range(width):
                var = (sumsq[g, j] - res[g, j] * res[g, j] / curr) / dof
                # rounding can push the one-pass variance marginally below zero
                if var < 0.0:
                    var = 0.0
                res[g, j] = math.sqrt(var)
    return res
@nb.njit(nogil=True, cache=True)
def copy_value(groups, source):
    """Expand a per-group table back to one row per element.

    :param groups: 1-d integer array; ``groups[i]`` selects the row of
                   ``source`` copied into output row ``i``
    :param source: 2-d array with one row per group
    :return: float array of shape (len(groups), source.shape[1])
    """
    n_rows = groups.shape[0]
    n_cols = source.shape[1]
    destination = np.zeros((n_rows, n_cols))

    for row in range(n_rows):
        for col in range(n_cols):
            destination[row, col] = source[groups[row], col]
    return destination
def transform(groups, x, func, ddof=1):
    """Compute a per-group statistic and broadcast it back to every row.

    :param groups: 1-d integer array of group labels, one per row of ``x``
    :param x: 2-d array of values
    :param func: one of 'mean', 'std', 'sum', 'abssum'
    :param ddof: delta degrees of freedom, only used when ``func == 'std'``
    :return: array with one row per element of ``groups`` holding the
             statistic of that element's group
    :raises ValueError: if ``func`` is not one of the recognised names
    """
    if func == 'mean':
        return copy_value(groups, agg_mean(groups, x))
    if func == 'std':
        return copy_value(groups, agg_std(groups, x, ddof=ddof))
    if func == 'sum':
        return copy_value(groups, agg_sum(groups, x))
    if func == 'abssum':
        return copy_value(groups, agg_abssum(groups, x))
    raise ValueError('({0}) is not recognized as valid functor'.format(func))
def aggregate(groups, x, func, ddof=1):
    """Compute a per-group statistic, returning one row per group.

    :param groups: 1-d integer array of group labels, one per row of ``x``
    :param x: 2-d array of values
    :param func: one of 'mean', 'std', 'sum', 'abssum'
    :param ddof: delta degrees of freedom, only used when ``func == 'std'``
    :return: the table produced by the matching ``agg_*`` routine
    :raises ValueError: if ``func`` is not one of the recognised names
    """
    if func == 'mean':
        return agg_mean(groups, x)
    if func == 'std':
        return agg_std(groups, x, ddof=ddof)
    if func == 'sum':
        return agg_sum(groups, x)
    if func == 'abssum':
        return agg_abssum(groups, x)
    raise ValueError('({0}) is not recognized as valid functor'.format(func))
if __name__ == '__main__':
    # Ad-hoc timing of aggregate(..., 'mean') on random data.
    import datetime as dt

    n_samples = 6000
    n_features = 10
    n_groups = 30
    n_loops = 1000

    groups = np.random.randint(n_groups, size=n_samples)
    x = np.random.randn(n_samples, n_features)

    # Two identical passes, each printing its elapsed time. Presumably the
    # first pass also pays the numba compile / cache-load cost, so the second
    # reflects steady-state speed -- confirm before quoting the numbers.
    for _pass in range(2):
        start = dt.datetime.now()
        for _ in range(n_loops):
            res = aggregate(groups, x, 'mean')
        print(dt.datetime.now() - start)
\ No newline at end of file
...@@ -12,7 +12,7 @@ from numpy.linalg import solve ...@@ -12,7 +12,7 @@ from numpy.linalg import solve
from typing import Tuple from typing import Tuple
from typing import Union from typing import Union
from typing import Dict from typing import Dict
from alphamind.aggregate import groupby from alphamind.utilities import groupby
def neutralize(x: np.ndarray, y: np.ndarray, groups: np.ndarray=None, output_explained=False, output_exposure=False) \ def neutralize(x: np.ndarray, y: np.ndarray, groups: np.ndarray=None, output_explained=False, output_exposure=False) \
......
...@@ -6,10 +6,10 @@ Created on 2017-4-25 ...@@ -6,10 +6,10 @@ Created on 2017-4-25
""" """
import numpy as np import numpy as np
from alphamind.aggregate import group_mapping from alphamind.utilities import group_mapping
from alphamind.aggregate import transform from alphamind.utilities import transform
from alphamind.aggregate import simple_mean from alphamind.utilities import simple_mean
from alphamind.aggregate import simple_std from alphamind.utilities import simple_std
def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray: def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray:
......
...@@ -7,10 +7,10 @@ Created on 2017-4-25 ...@@ -7,10 +7,10 @@ Created on 2017-4-25
import numpy as np import numpy as np
import numba as nb import numba as nb
from alphamind.aggregate import group_mapping from alphamind.utilities import group_mapping
from alphamind.aggregate import transform from alphamind.utilities import transform
from alphamind.aggregate import simple_mean from alphamind.utilities import simple_mean
from alphamind.aggregate import simple_std from alphamind.utilities import simple_std
@nb.njit(nogil=True, cache=True) @nb.njit(nogil=True, cache=True)
......
...@@ -9,7 +9,7 @@ import numpy as np ...@@ -9,7 +9,7 @@ import numpy as np
import numba as nb import numba as nb
from numpy import zeros from numpy import zeros
from numpy import zeros_like from numpy import zeros_like
from alphamind.aggregate import groupby from alphamind.utilities import groupby
@nb.njit(nogil=True, cache=True) @nb.njit(nogil=True, cache=True)
......
...@@ -6,9 +6,9 @@ Created on 2017-4-28 ...@@ -6,9 +6,9 @@ Created on 2017-4-28
""" """
import numpy as np import numpy as np
from alphamind.aggregate import group_mapping from alphamind.utilities import group_mapping
from alphamind.aggregate import aggregate from alphamind.utilities import aggregate
from alphamind.aggregate import simple_sum from alphamind.utilities import simple_sum
def simple_settle(weights: np.ndarray, ret_series: np.ndarray, groups: np.ndarray=None) -> np.ndarray: def simple_settle(weights: np.ndarray, ret_series: np.ndarray, groups: np.ndarray=None) -> np.ndarray:
......
...@@ -9,6 +9,10 @@ import os ...@@ -9,6 +9,10 @@ import os
import sys import sys
import logging import logging
import unittest import unittest
import math
from typing import List
import numpy as np
import numba as nb
alpha_logger = logging.getLogger('ALPHA_MIND') alpha_logger = logging.getLogger('ALPHA_MIND')
...@@ -46,3 +50,208 @@ class TestRunner(object): ...@@ -46,3 +50,208 @@ class TestRunner(object):
sys.exit(-1) sys.exit(-1)
else: else:
sys.exit(0) sys.exit(0)
def groupby(groups: np.ndarray) -> List[np.ndarray]:
    """Split element positions into one index array per distinct label.

    :param groups: 1-d array of non-negative integer labels (as required by
                   ``np.bincount``)
    :return: list of index arrays, one per label that occurs, ordered by
             ascending label; each array holds the positions in ``groups``
             that carry that label
    """
    sorted_positions = groups.argsort()
    sizes = np.bincount(groups)
    # drop labels that never occur so they do not yield empty slices
    sizes = sizes[sizes.nonzero()[0]]
    stops = np.cumsum(sizes)
    return [sorted_positions[stop - size:stop] for size, stop in zip(sizes, stops)]
@nb.njit(nogil=True, cache=True)
def group_mapping(groups: np.ndarray) -> np.ndarray:
    """Relabel group values as consecutive integer codes starting at 0.

    Codes are handed out while walking ``groups`` in sorted order, so the
    smallest value gets code 0, the next distinct value code 1, and so on.

    :param groups: 1-d array of group labels
    :return: array of the same length as ``groups`` holding each element's
             code; an empty input yields an empty result
    """
    length = groups.shape[0]
    order = groups.argsort()
    res = np.zeros(length, dtype=order.dtype)

    # nothing to label; also avoids reading order[0] of an empty array
    if length == 0:
        return res

    code = 0
    previous = groups[order[0]]
    res[order[0]] = code
    for i in range(1, length):
        curr_idx = order[i]
        curr_val = groups[curr_idx]
        # values are visited in sorted order, so a change starts the next group
        if curr_val != previous:
            code += 1
            previous = curr_val
        res[curr_idx] = code
    return res
@nb.njit(nogil=True, cache=True)
def simple_sum(x, axis=0):
    """Sum a 2-d array along one axis.

    :param x: 2-d array
    :param axis: 0 sums down each column, 1 sums across each row
    :return: 1-d float array of sums
    :raises ValueError: if ``axis`` is neither 0 nor 1
    """
    length, width = x.shape

    if axis == 0:
        res = np.zeros(width)
        for i in range(length):
            for j in range(width):
                res[j] += x[i, j]
    elif axis == 1:
        res = np.zeros(length)
        for i in range(length):
            for j in range(width):
                res[i] += x[i, j]
    else:
        # previously fell through and returned an unassigned ``res``
        raise ValueError('axis must be 0 or 1')
    return res
@nb.njit(nogil=True, cache=True)
def simple_mean(x, axis=0):
    """Average a 2-d array along one axis.

    :param x: 2-d array
    :param axis: 0 averages down each column, 1 averages across each row
    :return: 1-d float array of means
    :raises ValueError: if ``axis`` is neither 0 nor 1
    """
    length, width = x.shape

    if axis == 0:
        res = np.zeros(width)
        for j in range(width):
            for i in range(length):
                res[j] += x[i, j]
            res[j] /= length
    elif axis == 1:
        res = np.zeros(length)
        for i in range(length):
            for j in range(width):
                res[i] += x[i, j]
            res[i] /= width
    else:
        # previously fell through and returned an unassigned ``res``
        raise ValueError('axis must be 0 or 1')
    return res
@nb.njit(nogil=True, cache=True)
def simple_std(x, axis=0, ddof=1):
    """Standard deviation of a 2-d array along one axis (one-pass formula).

    Uses ``sqrt((sum(x^2) - sum(x)^2 / n) / (n - ddof))``.

    :param x: 2-d array
    :param axis: 0 works down each column, 1 works across each row
    :param ddof: delta degrees of freedom subtracted from the sample count
    :return: 1-d float array of standard deviations
    :raises ValueError: if ``axis`` is neither 0 nor 1
    """
    length, width = x.shape

    if axis == 0:
        res = np.zeros(width)
        sum_mat = np.zeros(width)
        for j in range(width):
            for i in range(length):
                res[j] += x[i, j] * x[i, j]
                sum_mat[j] += x[i, j]
            var = (res[j] - sum_mat[j] * sum_mat[j] / length) / (length - ddof)
            # rounding can push the one-pass variance marginally below zero
            if var < 0.0:
                var = 0.0
            res[j] = math.sqrt(var)
    elif axis == 1:
        res = np.zeros(length)
        # one running sum per row; this buffer used to be sized by ``width``
        # and was overrun whenever length > width
        sum_mat = np.zeros(length)
        for i in range(length):
            for j in range(width):
                res[i] += x[i, j] * x[i, j]
                sum_mat[i] += x[i, j]
            var = (res[i] - sum_mat[i] * sum_mat[i] / width) / (width - ddof)
            if var < 0.0:
                var = 0.0
            res[i] = math.sqrt(var)
    else:
        # previously fell through and returned an unassigned ``res``
        raise ValueError('axis must be 0 or 1')
    return res
@nb.njit(nogil=True, cache=True)
def agg_sum(groups, x):
    """Sum the rows of ``x`` per group.

    :param groups: 1-d integer array, one label per row of ``x``; labels are
                   used directly as row indices of the result (presumably
                   dense codes such as those from ``group_mapping``)
    :param x: 2-d array of values
    :return: float64 array of shape (groups.max() + 1, x.shape[1]) where row
             ``g`` is the column-wise sum over the rows labelled ``g``
    """
    n_bins = groups.max() + 1
    n_rows, n_cols = x.shape
    totals = np.zeros((n_bins, n_cols), dtype=np.float64)

    for row in range(n_rows):
        g = groups[row]
        for col in range(n_cols):
            totals[g, col] += x[row, col]
    return totals
@nb.njit(nogil=True, cache=True)
def agg_abssum(groups, x):
    """Sum the absolute values of the rows of ``x`` per group.

    :param groups: 1-d integer array, one label per row of ``x``; labels are
                   used directly as row indices of the result
    :param x: 2-d array of values
    :return: float64 array of shape (groups.max() + 1, x.shape[1]) where row
             ``g`` is the column-wise sum of ``abs(x)`` over rows labelled ``g``
    """
    n_bins = groups.max() + 1
    n_rows, n_cols = x.shape
    totals = np.zeros((n_bins, n_cols), dtype=np.float64)

    for row in range(n_rows):
        g = groups[row]
        for col in range(n_cols):
            totals[g, col] += abs(x[row, col])
    return totals
@nb.njit(nogil=True, cache=True)
def agg_mean(groups, x):
    """Average the rows of ``x`` per group.

    :param groups: 1-d integer array, one label per row of ``x``; labels are
                   used directly as row indices of the result
    :param x: 2-d array of values
    :return: float64 array of shape (groups.max() + 1, x.shape[1]); row ``g``
             is the column-wise mean of the rows labelled ``g``, or NaN when
             no row carries label ``g``
    """
    max_g = groups.max()
    length, width = x.shape
    res = np.zeros((max_g + 1, width), dtype=np.float64)
    bin_count = np.zeros(max_g + 1, dtype=np.int32)

    for i in range(length):
        g = groups[i]
        for j in range(width):
            res[g, j] += x[i, j]
        bin_count[g] += 1

    for g in range(max_g + 1):
        curr = bin_count[g]
        if curr == 0:
            # label never occurs: the mean is undefined, and dividing by the
            # zero count would raise ZeroDivisionError
            for j in range(width):
                res[g, j] = np.nan
        else:
            for j in range(width):
                res[g, j] /= curr
    return res
@nb.njit(nogil=True, cache=True)
def agg_std(groups, x, ddof=1):
    """Standard deviation of the rows of ``x`` per group (one-pass formula).

    Uses ``sqrt((sum(x^2) - sum(x)^2 / n) / (n - ddof))`` within each group.

    :param groups: 1-d integer array, one label per row of ``x``; labels are
                   used directly as row indices of the result
    :param x: 2-d array of values
    :param ddof: delta degrees of freedom subtracted from each group's count
    :return: float64 array of shape (groups.max() + 1, x.shape[1]); row ``g``
             is NaN when label ``g`` has no rows or has ``count <= ddof``
    """
    max_g = groups.max()
    length, width = x.shape
    res = np.zeros((max_g + 1, width), dtype=np.float64)
    sumsq = np.zeros((max_g + 1, width), dtype=np.float64)
    bin_count = np.zeros(max_g + 1, dtype=np.int32)

    for i in range(length):
        g = groups[i]
        for j in range(width):
            res[g, j] += x[i, j]
            sumsq[g, j] += x[i, j] * x[i, j]
        bin_count[g] += 1

    for g in range(max_g + 1):
        curr = bin_count[g]
        dof = curr - ddof
        if curr == 0 or dof <= 0:
            # too few observations: dividing would raise ZeroDivisionError
            # (or yield a meaningless value), so report "undefined" instead
            for j in range(width):
                res[g, j] = np.nan
        else:
            for j in range(width):
                var = (sumsq[g, j] - res[g, j] * res[g, j] / curr) / dof
                # rounding can push the one-pass variance marginally below zero
                if var < 0.0:
                    var = 0.0
                res[g, j] = math.sqrt(var)
    return res
@nb.njit(nogil=True, cache=True)
def copy_value(groups, source):
    """Expand a per-group table back to one row per element.

    :param groups: 1-d integer array; ``groups[i]`` selects the row of
                   ``source`` copied into output row ``i``
    :param source: 2-d array with one row per group
    :return: float array of shape (len(groups), source.shape[1])
    """
    n_rows = groups.shape[0]
    n_cols = source.shape[1]
    destination = np.zeros((n_rows, n_cols))

    for row in range(n_rows):
        for col in range(n_cols):
            destination[row, col] = source[groups[row], col]
    return destination
def transform(groups, x, func, ddof=1):
    """Compute a per-group statistic and broadcast it back to every row.

    :param groups: 1-d integer array of group labels, one per row of ``x``
    :param x: 2-d array of values
    :param func: one of 'mean', 'std', 'sum', 'abssum'
    :param ddof: delta degrees of freedom, only used when ``func == 'std'``
    :return: array with one row per element of ``groups`` holding the
             statistic of that element's group
    :raises ValueError: if ``func`` is not one of the recognised names
    """
    if func == 'mean':
        return copy_value(groups, agg_mean(groups, x))
    if func == 'std':
        return copy_value(groups, agg_std(groups, x, ddof=ddof))
    if func == 'sum':
        return copy_value(groups, agg_sum(groups, x))
    if func == 'abssum':
        return copy_value(groups, agg_abssum(groups, x))
    raise ValueError('({0}) is not recognized as valid functor'.format(func))
def aggregate(groups, x, func, ddof=1):
    """Compute a per-group statistic, returning one row per group.

    :param groups: 1-d integer array of group labels, one per row of ``x``
    :param x: 2-d array of values
    :param func: one of 'mean', 'std', 'sum', 'abssum'
    :param ddof: delta degrees of freedom, only used when ``func == 'std'``
    :return: the table produced by the matching ``agg_*`` routine
    :raises ValueError: if ``func`` is not one of the recognised names
    """
    if func == 'mean':
        return agg_mean(groups, x)
    if func == 'std':
        return agg_std(groups, x, ddof=ddof)
    if func == 'sum':
        return agg_sum(groups, x)
    if func == 'abssum':
        return agg_abssum(groups, x)
    raise ValueError('({0}) is not recognized as valid functor'.format(func))
cython >= 0.25.2
numpy >= 1.12.1 numpy >= 1.12.1
numba >= 0.30.0 numba >= 0.30.0
scikit-learn >= 0.18.1 scikit-learn >= 0.18.1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment