restructure

e9d233d4 · Dr.李 · 8328777b · 8328777b · e9d233d4 · e9d233d4
Commit e9d233d4 authored May 04, 2017 by Dr.李
8 changed files
--- a/alphamind/aggregate.py
+++ b/alphamind/aggregate.py
-# -*- coding: utf-8 -*-
-"""
-Created on 2017-5-3
-
-@author: cheng.li
-"""
-
-import math
-import numpy as np
-import numba as nb
-
-
-def groupby(groups):
-    order_group_idx = groups.argsort()
-    counts = np.bincount(groups)
-    nonzero_idx = counts.nonzero()[0]
-
-    start = 0
-    res = []
-
-    for i in nonzero_idx:
-        num_g = counts[i]
-        res.append(order_group_idx[start:start+num_g])
-        start += num_g
-    return res
-
-
-@nb.njit(nogil=True, cache=True)
-def group_mapping(groups):
-    length = groups.shape[0]
-    order = groups.argsort()
-    res = np.zeros(length, dtype=order.dtype)
-
-    start = 0
-    res[order[0]] = start
-    previous = groups[order[0]]
-
-    for i in range(1, length):
-        curr_idx = order[i]
-        curr_val = groups[curr_idx]
-        if curr_val != previous:
-            start += 1
-            res[curr_idx] = start
-        else:
-            res[curr_idx] = start
-        previous = curr_val
-    return res
-
-
-@nb.njit(nogil=True, cache=True)
-def simple_sum(x, axis=0):
-    length, width = x.shape
-
-    if axis == 0:
-        res = np.zeros(width)
-        for i in range(length):
-            for j in range(width):
-                res[j] += x[i, j]
-
-    elif axis == 1:
-        res = np.zeros(length)
-        for i in range(length):
-            for j in range(width):
-                res[i] += x[i, j]
-    return res
-
-
-@nb.njit(nogil=True, cache=True)
-def simple_mean(x, axis=0):
-    length, width = x.shape
-
-    if axis == 0:
-        res = np.zeros(width)
-        for j in range(width):
-            for i in range(length):
-                res[j] += x[i, j]
-            res[j] /= length
-
-    elif axis == 1:
-        res = np.zeros(length)
-        for i in range(length):
-            for j in range(width):
-                res[i] += x[i, j]
-            res[i] /= width
-    return res
-
-
-@nb.njit(nogil=True, cache=True)
-def simple_std(x, axis=0, ddof=1):
-    length, width = x.shape
-
-    if axis == 0:
-        res = np.zeros(width)
-        sum_mat = np.zeros(width)
-        for j in range(width):
-            for i in range(length):
-                res[j] += x[i, j] * x[i, j]
-                sum_mat[j] += x[i, j]
-            res[j] = math.sqrt((res[j] - sum_mat[j] * sum_mat[j] / length) / (length - ddof))
-    elif axis == 1:
-        res = np.zeros(length)
-        sum_mat = np.zeros(width)
-        for i in range(length):
-            for j in range(width):
-                res[i] += x[i, j] * x[i, j]
-                sum_mat[i] += x[i, j]
-            res[i] = math.sqrt((res[i] - sum_mat[i] * sum_mat[i] / width) / (width - ddof))
-    return res
-
-
-@nb.njit(nogil=True, cache=True)
-def agg_sum(groups, x):
-    max_g = groups.max()
-    length, width = x.shape
-    res = np.zeros((max_g+1, width), dtype=np.float64)
-
-    for i in range(length):
-        for j in range(width):
-            res[groups[i], j] += x[i, j]
-    return res
-
-
-@nb.njit(nogil=True, cache=True)
-def agg_abssum(groups, x):
-    max_g = groups.max()
-    length, width = x.shape
-    res = np.zeros((max_g+1, width), dtype=np.float64)
-
-    for i in range(length):
-        for j in range(width):
-            res[groups[i], j] += abs(x[i, j])
-    return res
-
-
-@nb.njit(nogil=True, cache=True)
-def agg_mean(groups, x):
-    max_g = groups.max()
-    length, width = x.shape
-    res = np.zeros((max_g+1, width), dtype=np.float64)
-    bin_count = np.zeros(max_g+1, dtype=np.int32)
-
-    for i in range(length):
-        for j in range(width):
-            res[groups[i], j] += x[i, j]
-        bin_count[groups[i]] += 1
-
-    for i in range(max_g+1):
-        curr = bin_count[i]
-        for j in range(width):
-            res[i, j] /= curr
-    return res
-
-
-@nb.njit(nogil=True, cache=True)
-def agg_std(groups, x, ddof=1):
-    max_g = groups.max()
-    length, width = x.shape
-    res = np.zeros((max_g+1, width), dtype=np.float64)
-    sumsq = np.zeros((max_g + 1, width), dtype=np.float64)
-    bin_count = np.zeros(max_g+1, dtype=np.int32)
-
-    for i in range(length):
-        for j in range(width):
-            res[groups[i], j] += x[i, j]
-            sumsq[groups[i], j] += x[i, j] * x[i, j]
-        bin_count[groups[i]] += 1
-
-    for i in range(max_g+1):
-        curr = bin_count[i]
-        for j in range(width):
-            res[i, j] = math.sqrt((sumsq[i, j] - res[i, j] * res[i, j] / curr) / (curr - ddof))
-    return res
-
-
-@nb.njit(nogil=True, cache=True)
-def copy_value(groups, source):
-    length = groups.shape[0]
-    width = source.shape[1]
-    destination = np.zeros((length, width))
-    for i in range(length):
-        k = groups[i]
-        for j in range(width):
-            destination[i, j] = source[k, j]
-    return destination
-
-
-def transform(groups, x, func, ddof=1):
-
-    if func == 'mean':
-        value_data = agg_mean(groups, x)
-    elif func == 'std':
-        value_data = agg_std(groups, x, ddof=ddof)
-    elif func == 'sum':
-        value_data = agg_sum(groups, x)
-    elif func =='abssum':
-        value_data = agg_abssum(groups, x)
-    else:
-        raise ValueError('({0}) is not recognized as valid functor'.format(func))
-
-    return copy_value(groups, value_data)
-
-
-def aggregate(groups, x, func, ddof=1):
-    if func == 'mean':
-        value_data = agg_mean(groups, x)
-    elif func == 'std':
-        value_data = agg_std(groups, x, ddof=ddof)
-    elif func == 'sum':
-        value_data = agg_sum(groups, x)
-    elif func =='abssum':
-        value_data = agg_abssum(groups, x)
-    else:
-        raise ValueError('({0}) is not recognized as valid functor'.format(func))
-
-    return value_data
-
-
-if __name__ == '__main__':
-    n_samples = 6000
-    n_features = 10
-    n_groups = 30
-    groups = np.random.randint(n_groups, size=n_samples)
-    max_g = n_groups - 1
-    x = np.random.randn(n_samples, n_features)
-
-    import datetime as dt
-    start = dt.datetime.now()
-    for i in range(1000):
-        res = aggregate(groups, x, 'mean')
-    print(dt.datetime.now() - start)
-
-    #transform = nb.jit(transform)
-
-    start = dt.datetime.now()
-
-    for i in range(1000):
-        res = aggregate(groups, x, 'mean')
-    print(dt.datetime.now() - start)
\ No newline at end of file
--- a/alphamind/data/neutralize.py
+++ b/alphamind/data/neutralize.py
@@ -12,7 +12,7 @@ from numpy.linalg import solve
 from typing import Tuple
 from typing import Union
 from typing import Dict
-from alphamind.aggregate import groupby
+from alphamind.utilities import groupby


 def neutralize(x: np.ndarray, y: np.ndarray, groups: np.ndarray=None, output_explained=False, output_exposure=False) \

--- a/alphamind/data/standardize.py
+++ b/alphamind/data/standardize.py
@@ -6,10 +6,10 @@ Created on 2017-4-25
 """

 import numpy as np
-from alphamind.aggregate import group_mapping
-from alphamind.aggregate import transform
-from alphamind.aggregate import simple_mean
-from alphamind.aggregate import simple_std
+from alphamind.utilities import group_mapping
+from alphamind.utilities import transform
+from alphamind.utilities import simple_mean
+from alphamind.utilities import simple_std


 def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray:

--- a/alphamind/data/winsorize.py
+++ b/alphamind/data/winsorize.py
@@ -7,10 +7,10 @@ Created on 2017-4-25

 import numpy as np
 import numba as nb
-from alphamind.aggregate import group_mapping
-from alphamind.aggregate import transform
-from alphamind.aggregate import simple_mean
-from alphamind.aggregate import simple_std
+from alphamind.utilities import group_mapping
+from alphamind.utilities import transform
+from alphamind.utilities import simple_mean
+from alphamind.utilities import simple_std


 @nb.njit(nogil=True, cache=True)

--- a/alphamind/portfolio/rankbuilder.py
+++ b/alphamind/portfolio/rankbuilder.py
@@ -9,7 +9,7 @@ import numpy as np
 import numba as nb
 from numpy import zeros
 from numpy import zeros_like
-from alphamind.aggregate import groupby
+from alphamind.utilities import groupby


 @nb.njit(nogil=True, cache=True)

--- a/alphamind/settlement/simplesettle.py
+++ b/alphamind/settlement/simplesettle.py
@@ -6,9 +6,9 @@ Created on 2017-4-28
 """

 import numpy as np
-from alphamind.aggregate import group_mapping
-from alphamind.aggregate import aggregate
-from alphamind.aggregate import simple_sum
+from alphamind.utilities import group_mapping
+from alphamind.utilities import aggregate
+from alphamind.utilities import simple_sum


 def simple_settle(weights: np.ndarray, ret_series: np.ndarray, groups: np.ndarray=None) -> np.ndarray:

--- a/alphamind/utilities.py
+++ b/alphamind/utilities.py
@@ -9,6 +9,10 @@ import os
 import sys
 import logging
 import unittest
+import math
+from typing import List
+import numpy as np
+import numba as nb


 alpha_logger = logging.getLogger('ALPHA_MIND')
@@ -46,3 +50,208 @@ class TestRunner(object):
            sys.exit(-1)
        else:
            sys.exit(0)
+
+
+def groupby(groups: np.ndarray) -> List[np.ndarray]:  # one array of row positions per group id that occurs, ascending by id
+    order_group_idx = groups.argsort()  # row positions ordered by ascending group id
+    counts = np.bincount(groups)  # occurrences of each id; np.bincount requires non-negative integers
+    nonzero_idx = counts.nonzero()[0]  # the ids that actually occur
+
+    start = 0  # offset into order_group_idx where the current group's run begins
+    res = []
+
+    for i in nonzero_idx:
+        num_g = counts[i]
+        res.append(order_group_idx[start:start+num_g])  # the next num_g sorted positions all carry id i
+        start += num_g
+    return res
+
+
+@nb.njit(nogil=True, cache=True)
+def group_mapping(groups: np.ndarray) -> np.ndarray:  # relabel group ids as consecutive integers from 0, in ascending id order
+    length = groups.shape[0]
+    order = groups.argsort()  # positions ordered by ascending group id
+    res = np.zeros(length, dtype=order.dtype)
+
+    start = 0  # label currently being handed out
+    res[order[0]] = start  # NOTE(review): indexes order[0] unconditionally, so empty input is not handled
+    previous = groups[order[0]]
+
+    for i in range(1, length):
+        curr_idx = order[i]
+        curr_val = groups[curr_idx]
+        if curr_val != previous:  # first element of a new distinct id: advance the label
+            start += 1
+            res[curr_idx] = start
+        else:
+            res[curr_idx] = start
+        previous = curr_val
+    return res  # res[k] is the new label of groups[k]
+
+
+@nb.njit(nogil=True, cache=True)
+def simple_sum(x, axis=0):  # loop-based sum of x along axis 0 (one value per column) or axis 1 (one value per row)
+    length, width = x.shape  # two-value unpacking: x is treated as 2-D
+
+    if axis == 0:
+        res = np.zeros(width)
+        for i in range(length):
+            for j in range(width):
+                res[j] += x[i, j]
+
+    elif axis == 1:
+        res = np.zeros(length)
+        for i in range(length):
+            for j in range(width):
+                res[i] += x[i, j]
+    return res  # NOTE(review): res is assigned only in the axis 0 / axis 1 branches
+
+
+@nb.njit(nogil=True, cache=True)
+def simple_mean(x, axis=0):  # loop-based mean of x along axis 0 (one value per column) or axis 1 (one value per row)
+    length, width = x.shape  # two-value unpacking: x is treated as 2-D
+
+    if axis == 0:
+        res = np.zeros(width)
+        for j in range(width):
+            for i in range(length):
+                res[j] += x[i, j]
+            res[j] /= length  # column total divided by the number of rows
+
+    elif axis == 1:
+        res = np.zeros(length)
+        for i in range(length):
+            for j in range(width):
+                res[i] += x[i, j]
+            res[i] /= width  # row total divided by the number of columns
+    return res  # NOTE(review): res is assigned only in the axis 0 / axis 1 branches
+
+
+@nb.njit(nogil=True, cache=True)
+def simple_std(x, axis=0, ddof=1):
+    length, width = x.shape
+
+    if axis == 0:
+        res = np.zeros(width)
+        sum_mat = np.zeros(width)
+        for j in range(width):
+            for i in range(length):
+                res[j] += x[i, j] * x[i, j]
+                sum_mat[j] += x[i, j]
+            res[j] = math.sqrt((res[j] - sum_mat[j] * sum_mat[j] / length) / (length - ddof))
+    elif axis == 1:
+        res = np.zeros(length)
+        sum_mat = np.zeros(width)
+        for i in range(length):
+            for j in range(width):
+                res[i] += x[i, j] * x[i, j]
+                sum_mat[i] += x[i, j]
+            res[i] = math.sqrt((res[i] - sum_mat[i] * sum_mat[i] / width) / (width - ddof))
+    return res
+
+
+@nb.njit(nogil=True, cache=True)
+def agg_sum(groups, x):  # per-group column sums: result row g adds up the rows of x whose group id is g
+    max_g = groups.max()  # ids index result rows directly, hence max_g + 1 rows below
+    length, width = x.shape
+    res = np.zeros((max_g+1, width), dtype=np.float64)  # rows for ids that never occur stay zero
+
+    for i in range(length):
+        for j in range(width):
+            res[groups[i], j] += x[i, j]
+    return res
+
+
+@nb.njit(nogil=True, cache=True)
+def agg_abssum(groups, x):  # per-group column sums of absolute values: result row g covers the rows of x with group id g
+    max_g = groups.max()  # ids index result rows directly, hence max_g + 1 rows below
+    length, width = x.shape
+    res = np.zeros((max_g+1, width), dtype=np.float64)  # rows for ids that never occur stay zero
+
+    for i in range(length):
+        for j in range(width):
+            res[groups[i], j] += abs(x[i, j])
+    return res
+
+
+@nb.njit(nogil=True, cache=True)
+def agg_mean(groups, x):  # per-group column means: result row g averages the rows of x whose group id is g
+    max_g = groups.max()  # ids index result rows directly, hence max_g + 1 rows below
+    length, width = x.shape
+    res = np.zeros((max_g+1, width), dtype=np.float64)  # first holds per-group sums, then the means
+    bin_count = np.zeros(max_g+1, dtype=np.int32)  # number of rows seen for each id
+
+    for i in range(length):
+        for j in range(width):
+            res[groups[i], j] += x[i, j]
+        bin_count[groups[i]] += 1
+
+    for i in range(max_g+1):
+        curr = bin_count[i]
+        for j in range(width):
+            res[i, j] /= curr  # NOTE(review): curr is 0 for any id in 0..max_g that never occurs; confirm callers pass gap-free ids
+    return res
+
+
+@nb.njit(nogil=True, cache=True)
+def agg_std(groups, x, ddof=1):  # per-group column standard deviations with divisor (group row count - ddof)
+    max_g = groups.max()  # ids index result rows directly, hence max_g + 1 rows below
+    length, width = x.shape
+    res = np.zeros((max_g+1, width), dtype=np.float64)  # first holds per-group sums, then the final std values
+    sumsq = np.zeros((max_g + 1, width), dtype=np.float64)  # per-group sums of squares
+    bin_count = np.zeros(max_g+1, dtype=np.int32)  # number of rows seen for each id
+
+    for i in range(length):
+        for j in range(width):
+            res[groups[i], j] += x[i, j]
+            sumsq[groups[i], j] += x[i, j] * x[i, j]
+        bin_count[groups[i]] += 1
+
+    for i in range(max_g+1):
+        curr = bin_count[i]  # NOTE(review): curr == 0 or curr == ddof gives a zero divisor on the next statement
+        for j in range(width):
+            res[i, j] = math.sqrt((sumsq[i, j] - res[i, j] * res[i, j] / curr) / (curr - ddof))
+    return res
+
+
+@nb.njit(nogil=True, cache=True)
+def copy_value(groups, source):  # expand per-group rows to one row per element of groups: destination[i] = source[groups[i]]
+    length = groups.shape[0]
+    width = source.shape[1]
+    destination = np.zeros((length, width))
+    for i in range(length):
+        k = groups[i]  # row of source that belongs to element i
+        for j in range(width):
+            destination[i, j] = source[k, j]
+    return destination
+
+
+def transform(groups, x, func, ddof=1):  # per-group statistic of x selected by func, copied back onto every row
+
+    if func == 'mean':
+        value_data = agg_mean(groups, x)
+    elif func == 'std':
+        value_data = agg_std(groups, x, ddof=ddof)  # ddof is only used by this branch
+    elif func == 'sum':
+        value_data = agg_sum(groups, x)
+    elif func =='abssum':
+        value_data = agg_abssum(groups, x)
+    else:
+        raise ValueError('({0}) is not recognized as valid functor'.format(func))  # any other func name is rejected
+
+    return copy_value(groups, value_data)  # row i receives the statistic row of group groups[i]
+
+
+def aggregate(groups, x, func, ddof=1):  # per-group statistic of x selected by func: 'mean', 'std', 'sum' or 'abssum'
+    if func == 'mean':
+        value_data = agg_mean(groups, x)
+    elif func == 'std':
+        value_data = agg_std(groups, x, ddof=ddof)  # ddof is only used by this branch
+    elif func == 'sum':
+        value_data = agg_sum(groups, x)
+    elif func =='abssum':
+        value_data = agg_abssum(groups, x)
+    else:
+        raise ValueError('({0}) is not recognized as valid functor'.format(func))  # any other func name is rejected
+
+    return value_data  # one row per group id in 0..groups.max(), unlike transform, which expands back to rows
--- a/requirements.txt
+++ b/requirements.txt
-cython >= 0.25.2
 numpy >= 1.12.1
 numba >= 0.30.0
 scikit-learn >= 0.18.1