restructure

414ed809 · Dr.李 · bbb01231 · 414ed809 · 414ed809 · 414ed809
Commit 414ed809 authored May 03, 2017 by Dr.李
8 changed files
--- a/alphamind/aggregate.py
+++ b/alphamind/aggregate.py
+# -*- coding: utf-8 -*-
+"""
+Created on 2017-5-3
+
+@author: cheng.li
+"""
+
+import math
+import numpy as np
+import numba as nb
+
+
+@nb.njit
+def agg_sum(groups, x):
+    """Sum the rows of ``x`` column by column within each group.
+
+    ``groups[i]`` is used directly as the output row for ``x[i]``, so the
+    result has ``groups.max() + 1`` rows and as many columns as ``x``.
+    The returned array is float64.
+    """
+    n_bins = groups.max() + 1
+    n_rows, n_cols = x.shape
+    totals = np.zeros((n_bins, n_cols), dtype=np.float64)
+
+    for row in range(n_rows):
+        bin_id = groups[row]
+        for col in range(n_cols):
+            totals[bin_id, col] += x[row, col]
+    return totals
+
+
+@nb.njit
+def agg_abssum(groups, x):
+    """Sum the absolute values of ``x`` column by column within each group.
+
+    ``groups[i]`` selects the output row that ``abs(x[i, :])`` is added to.
+    The result is a float64 array with ``groups.max() + 1`` rows and the
+    same number of columns as ``x``.
+    """
+    n_bins = groups.max() + 1
+    n_rows, n_cols = x.shape
+    abs_totals = np.zeros((n_bins, n_cols), dtype=np.float64)
+
+    for row in range(n_rows):
+        bin_id = groups[row]
+        for col in range(n_cols):
+            abs_totals[bin_id, col] += abs(x[row, col])
+    return abs_totals
+
+
+@nb.njit
+def agg_mean(groups, x):
+    """Column-wise mean of the rows of ``x`` within each group.
+
+    ``groups[i]`` is used directly as the output row for ``x[i]``; the
+    result is a float64 array with ``groups.max() + 1`` rows and the same
+    number of columns as ``x``.
+
+    A group id in ``0..groups.max()`` that never occurs in ``groups`` has
+    no rows to average; its output row is filled with NaN instead of
+    dividing by a zero count.
+    """
+    max_g = groups.max()
+    length, width = x.shape
+    res = np.zeros((max_g+1, width), dtype=np.float64)
+    bin_count = np.zeros(max_g+1, dtype=np.int32)
+
+    for i in range(length):
+        g = groups[i]
+        for j in range(width):
+            res[g, j] += x[i, j]
+        bin_count[g] += 1
+
+    for i in range(max_g+1):
+        curr = bin_count[i]
+        if curr == 0:
+            # Empty group: with numba's default error model a division by
+            # a zero count raises ZeroDivisionError, so mark it as NaN.
+            for j in range(width):
+                res[i, j] = np.nan
+        else:
+            for j in range(width):
+                res[i, j] /= curr
+    return res
+
+
+@nb.njit
+def agg_std(groups, x, ddof=1):
+    """Column-wise standard deviation of the rows of ``x`` within each group.
+
+    Uses the one-pass formula ``(sum(x^2) - sum(x)^2 / n) / (n - ddof)``.
+    ``groups[i]`` is used directly as the output row for ``x[i]``; the
+    result is a float64 array with ``groups.max() + 1`` rows and the same
+    number of columns as ``x``.
+
+    Groups with no rows, or with ``n - ddof <= 0`` (for example a single
+    row with ``ddof=1``), have no defined deviation and are reported as
+    NaN rather than dividing by zero.
+    """
+    max_g = groups.max()
+    length, width = x.shape
+    res = np.zeros((max_g+1, width), dtype=np.float64)
+    sumsq = np.zeros((max_g + 1, width), dtype=np.float64)
+    bin_count = np.zeros(max_g+1, dtype=np.int32)
+
+    for i in range(length):
+        g = groups[i]
+        for j in range(width):
+            res[g, j] += x[i, j]
+            sumsq[g, j] += x[i, j] * x[i, j]
+        bin_count[g] += 1
+
+    for i in range(max_g+1):
+        curr = bin_count[i]
+        dof = curr - ddof
+        if curr == 0 or dof <= 0:
+            # Not enough observations for this ddof; avoid the zero
+            # (or negative) divisor.
+            for j in range(width):
+                res[i, j] = np.nan
+        else:
+            for j in range(width):
+                var = (sumsq[i, j] - res[i, j] * res[i, j] / curr) / dof
+                # Floating-point cancellation can push the variance of a
+                # (near-)constant group slightly below zero; clamp it so the
+                # square root stays defined. NaN inputs still propagate
+                # because ``nan < 0.0`` is False.
+                if var < 0.0:
+                    var = 0.0
+                res[i, j] = math.sqrt(var)
+    return res
+
+
+@nb.njit
+def set_value(groups, source, destinantion):
+    """Copy per-group rows of ``source`` into ``destinantion`` in place.
+
+    For every row ``i`` of ``destinantion``, the row ``source[groups[i]]``
+    is written into it column by column. Nothing is returned.
+    """
+    n_rows, n_cols = destinantion.shape
+    for row in range(n_rows):
+        src_row = groups[row]
+        for col in range(n_cols):
+            destinantion[row, col] = source[src_row, col]
+
+
+def transform(groups, x, func):
+    """Replace every row of ``x`` with the aggregate of its group.
+
+    ``func`` names the aggregate: ``'mean'``, ``'std'`` (with ``ddof=1``),
+    ``'sum'`` or ``'abssum'``. The per-group values are computed once and
+    then copied into a new array shaped (and typed) like ``x`` via
+    ``set_value``.
+
+    Raises ``ValueError`` when ``func`` is not one of the names above.
+    """
+    broadcasted = np.zeros_like(x)
+
+    # Checked in order with ``==``, exactly like an if/elif chain.
+    kernels = (
+        ('mean', lambda g, v: agg_mean(g, v)),
+        ('std', lambda g, v: agg_std(g, v, ddof=1)),
+        ('sum', lambda g, v: agg_sum(g, v)),
+        ('abssum', lambda g, v: agg_abssum(g, v)),
+    )
+    for name, kernel in kernels:
+        if func == name:
+            per_group = kernel(groups, x)
+            break
+    else:
+        raise ValueError('({0}) is not recognized as valid functor'.format(func))
+
+    set_value(groups, per_group, broadcasted)
+    return broadcasted
+
+
+def aggregate(groups, x, func):
+    """Return one aggregated row per group id for the rows of ``x``.
+
+    ``func`` names the aggregate: ``'mean'``, ``'std'`` (with ``ddof=1``),
+    ``'sum'`` or ``'abssum'``; the call is forwarded to the matching
+    ``agg_*`` kernel and its result is returned unchanged.
+
+    Raises ``ValueError`` when ``func`` is not one of the names above.
+    """
+    if func == 'mean':
+        return agg_mean(groups, x)
+    if func == 'std':
+        return agg_std(groups, x, ddof=1)
+    if func == 'sum':
+        return agg_sum(groups, x)
+    if func == 'abssum':
+        return agg_abssum(groups, x)
+    raise ValueError('({0}) is not recognized as valid functor'.format(func))
+
+
+if __name__ == '__main__':
+    # Ad-hoc timing of ``aggregate(..., 'mean')`` on random data.
+    import datetime as dt
+
+    n_samples = 6000
+    n_features = 10
+    n_groups = 30
+    groups = np.random.randint(n_groups, size=n_samples)
+    x = np.random.randn(n_samples, n_features)
+
+    # First pass: presumably includes the one-off numba compilation of the
+    # kernel on its first call, so it is expected to be the slower of the two.
+    start = dt.datetime.now()
+    for _ in range(1000):
+        aggregate(groups, x, 'mean')
+    print(dt.datetime.now() - start)
+
+    # Second pass: same workload again, for comparison with the first.
+    start = dt.datetime.now()
+    for _ in range(1000):
+        aggregate(groups, x, 'mean')
+    print(dt.datetime.now() - start)
\ No newline at end of file
--- a/alphamind/data/neutralize.py
+++ b/alphamind/data/neutralize.py
@@ -11,7 +11,7 @@ from numpy.linalg import solve
 from typing import Tuple
 from typing import Union
 from typing import Dict
-from alphamind.aggregate import groupby
+from alphamind.groupby import groupby


 def neutralize(x: np.ndarray, y: np.ndarray, groups: np.ndarray=None, output_explained=False, output_exposure=False) \

--- a/alphamind/data/standardize.py
+++ b/alphamind/data/standardize.py
@@ -6,8 +6,8 @@ Created on 2017-4-25
 """

 import numpy as np
-from alphamind.aggregate import group_mapping
-from alphamind.impl import transform
+from alphamind.groupby import group_mapping
+from alphamind.aggregate import transform


 def standardize(x: np.ndarray, groups: np.ndarray=None) -> np.ndarray:

--- a/alphamind/data/winsorize.py
+++ b/alphamind/data/winsorize.py
@@ -6,8 +6,8 @@ Created on 2017-4-25
 """

 import numpy as np
-from alphamind.aggregate import group_mapping
-from alphamind.impl import transform
+from alphamind.groupby import group_mapping
+from alphamind.aggregate import transform


 def winsorize_normal(x: np.ndarray, num_stds: int=3, groups: np.ndarray=None) -> np.ndarray:

--- a/alphamind/aggregate.pyx
+++ b/alphamind/aggregate.pyx
-# -*- coding: utf-8 -*-
-# distutils: language = c++
-"""
-Created on 2017-4-26
-
-@author: cheng.li
-"""
-
-import numpy as np
-from numpy import zeros
-from numpy import max as nmax
-cimport numpy as np
-cimport cython
-from libc.math cimport sqrt
-from libc.math cimport fabs
-from libcpp.vector cimport vector as cpp_vector
-from libcpp.unordered_map cimport unordered_map as cpp_map
-from cython.operator cimport dereference as deref
-
-
-ctypedef long long int64_t
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-cpdef groupby(long[:] groups):
-
-    cdef long long length = groups.shape[0]
-    cdef cpp_map[long, cpp_vector[int64_t]] group_ids
-    cdef long long i
-    cdef long curr_tag
-    cdef cpp_map[long, cpp_vector[int64_t]].iterator it
-    cdef np.ndarray[long long, ndim=1] npy_array
-
-    for i in range(length):
-        curr_tag = groups[i]
-        it = group_ids.find(curr_tag)
-
-        if it == group_ids.end():
-            group_ids[curr_tag] = [i]
-        else:
-            deref(it).second.push_back(i)
-
-    return [np.array(v) for v in group_ids.values()]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-cpdef np.ndarray[int, ndim=1] group_mapping(long[:] groups):
-    cdef size_t length = groups.shape[0]
-    cdef np.ndarray[int, ndim=1] res= zeros(length, dtype=int)
-    cdef cpp_map[long, long] current_hold
-    cdef long curr_tag
-    cdef long running_tag = -1
-    cdef size_t i = 0
-    cdef cpp_map[long, long].iterator it
-
-    for i in range(length):
-        curr_tag = groups[i]
-        it = current_hold.find(curr_tag)
-        if it == current_hold.end():
-            running_tag += 1
-            res[i] = running_tag
-            current_hold[curr_tag] = running_tag
-        else:
-            res[i] = deref(it).second
-
-    return res
+# -*- coding: utf-8 -*-
+# distutils: language = c++
+"""
+Created on 2017-4-26
+
+@author: cheng.li
+"""
+
+import numpy as np
+from numpy import zeros
+from numpy import max as nmax
+cimport numpy as np
+cimport cython
+from libc.math cimport sqrt
+from libc.math cimport fabs
+from libcpp.vector cimport vector as cpp_vector
+from libcpp.unordered_map cimport unordered_map as cpp_map
+from cython.operator cimport dereference as deref
+
+
+ctypedef long long int64_t
+
+
+# Group the positions of ``groups`` by tag value.
+#
+# Walks ``groups`` once and appends each index ``i`` to the vector stored
+# under ``groups[i]`` in an unordered map, then returns a Python list with
+# one numpy array of indices per distinct tag. The map is unordered, so the
+# order of the arrays in the returned list should not be relied upon.
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cpdef groupby(long[:] groups):
+
+    cdef long long length = groups.shape[0]
+    cdef cpp_map[long, cpp_vector[int64_t]] group_ids
+    cdef long long i
+    cdef long curr_tag
+    cdef cpp_map[long, cpp_vector[int64_t]].iterator it
+    # NOTE(review): npy_array is declared but never used in this function.
+    cdef np.ndarray[long long, ndim=1] npy_array
+
+    for i in range(length):
+        curr_tag = groups[i]
+        it = group_ids.find(curr_tag)
+
+        if it == group_ids.end():
+            # First occurrence of this tag: start its index vector with i.
+            group_ids[curr_tag] = [i]
+        else:
+            # Tag already seen: append i to the existing vector in place.
+            deref(it).second.push_back(i)
+
+    # NOTE(review): presumably relies on Cython coercing the C++ map to a
+    # Python dict before ``.values()`` is called - confirm.
+    return [np.array(v) for v in group_ids.values()]
+
+
+# Relabel arbitrary tags as dense integer ids in order of first appearance.
+#
+# The first distinct value seen in ``groups`` is mapped to 0, the next new
+# value to 1, and so on; repeated values reuse the id assigned at their
+# first occurrence. Returns an integer array of the same length as
+# ``groups`` holding the id of each element.
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cpdef np.ndarray[int, ndim=1] group_mapping(long[:] groups):
+    cdef size_t length = groups.shape[0]
+    cdef np.ndarray[int, ndim=1] res= zeros(length, dtype=int)
+    # Maps each original tag to the dense id it was assigned.
+    cdef cpp_map[long, long] current_hold
+    cdef long curr_tag
+    # Last id handed out; starts at -1 so the first new tag receives 0.
+    cdef long running_tag = -1
+    cdef size_t i = 0
+    cdef cpp_map[long, long].iterator it
+
+    for i in range(length):
+        curr_tag = groups[i]
+        it = current_hold.find(curr_tag)
+        if it == current_hold.end():
+            # Unseen tag: allocate the next id and remember it.
+            running_tag += 1
+            res[i] = running_tag
+            current_hold[curr_tag] = running_tag
+        else:
+            # Known tag: reuse the id stored in the map.
+            res[i] = deref(it).second
+
+    return res
--- a/alphamind/portfolio/rankbuilder.py
+++ b/alphamind/portfolio/rankbuilder.py
@@ -8,7 +8,7 @@ Created on 2017-4-26
 import numpy as np
 import numba as nb
 from numpy import zeros
-from alphamind.aggregate import groupby
+from alphamind.groupby import groupby


 @nb.njit

--- a/alphamind/settlement/simplesettle.py
+++ b/alphamind/settlement/simplesettle.py
@@ -6,8 +6,8 @@ Created on 2017-4-28
 """

 import numpy as np
-from alphamind.aggregate import group_mapping
-from alphamind.impl import aggregate
+from alphamind.groupby import group_mapping
+from alphamind.aggregate import aggregate


 def simple_settle(weights: np.ndarray, ret_series: np.ndarray, groups: np.ndarray=None) -> np.ndarray:

--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@ else:
    line_trace = False


-ext_modules = ['alphamind/aggregate.pyx']
+ext_modules = ['alphamind/groupby.pyx']


 def generate_extensions(ext_modules, line_trace=False):