update groupby

b5c7b1dd · Dr.李 · 70214fe0 · b5c7b1dd · b5c7b1dd
Commit b5c7b1dd authored Apr 30, 2017 by Dr.李
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 34 additions and 53 deletions

aggregate.pyx alphamind/aggregate.pyx +29 -38

rankbuilder.py alphamind/portfolio/rankbuilder.py +5 -15

No files found.
--- a/alphamind/aggregate.pyx
+++ b/alphamind/aggregate.pyx
@@ -14,9 +14,7 @@ from libc.math cimport fabs
 from libc.stdlib cimport calloc
 from libc.stdlib cimport free
 from numpy import array
-from cpython.dict cimport PyDict_GetItem, PyDict_SetItem
-from cpython.ref cimport PyObject
-from cpython.list cimport PyList_Append
+from libcpp.vector cimport vector as cpp_vector
 from libcpp.unordered_map cimport unordered_map as cpp_map
 from cython.operator cimport dereference as deref

@@ -25,15 +23,7 @@ np.import_array()
 cdef extern from "numpy/arrayobject.h":
    void PyArray_ENABLEFLAGS(np.ndarray arr, int flags)

-
-cdef inline object _groupby_core(dict d, object key, object item):
-    cdef PyObject *obj = PyDict_GetItem(d, key)
-    if obj is NULL:
-        val = []
-        PyList_Append(val, item)
-        PyDict_SetItem(d, key, val)
-    else:
-        PyList_Append(<object>obj, item)
+ctypedef long long int64_t


 @cython.boundscheck(False)
@@ -41,15 +31,29 @@ cdef inline object _groupby_core(dict d, object key, object item):
 @cython.initializedcheck(False)
 cpdef list groupby(long[:] groups):

-    cdef size_t length = groups.shape[0]
-    cdef dict group_ids = {}
-    cdef size_t i
+    cdef long long length = groups.shape[0]
+    cdef cpp_map[long, cpp_vector[int64_t]] group_ids
+    cdef long long i
    cdef long curr_tag
+    cdef cpp_map[long, cpp_vector[int64_t]].iterator it
+    cdef list res = []
+    cdef np.ndarray[long long, ndim=1] npy_array
+    cdef cpp_vector[int64_t] v
+    cdef long long* arr_ptr

    for i in range(length):
-        _groupby_core(group_ids, groups[i], i)
+        curr_tag = groups[i]
+        it = group_ids.find(curr_tag)

-    return [array(v, dtype=np.int64) for v in group_ids.values()]
+        if it == group_ids.end():
+            group_ids[curr_tag] = [i]
+        else:
+            deref(it).second.push_back(i)
+
+    for v in group_ids.values():
+        res.append(array(v))
+
+    return res


 @cython.boundscheck(False)
@@ -58,38 +62,25 @@ cpdef list groupby(long[:] groups):
 cdef long* group_mapping(long* groups, size_t length, size_t* max_g):
    cdef long *res_ptr = <long*>calloc(length, sizeof(int))
    cdef cpp_map[long, long] current_hold
-    cdef long curr_g
-    cdef long running_g = -1
+    cdef long curr_tag
+    cdef long running_tag = -1
    cdef size_t i = 0
    cdef cpp_map[long, long].iterator it

    for i in range(length):
-        curr_g = groups[i]
-        it = current_hold.find(curr_g)
+        curr_tag = groups[i]
+        it = current_hold.find(curr_tag)
        if it == current_hold.end():
-            running_g += 1
-            res_ptr[i] = running_g
-            current_hold[curr_g] = running_g
+            running_tag += 1
+            res_ptr[i] = running_tag
+            current_hold[curr_tag] = running_tag
        else:
            res_ptr[i] = deref(it).second

-    max_g[0] = running_g
+    max_g[0] = running_tag
    return res_ptr


-cpdef group_mapping_test(long[:] groups):
-    cdef size_t length = groups.shape[0]
-    cdef size_t* max_g = <size_t*>calloc(1, sizeof(size_t))
-    cdef size_t g_max
-    cdef long* mapped_groups = group_mapping(&groups[0], length, max_g)
-
-    res = np.PyArray_SimpleNewFromData(1, [length], np.NPY_INT32, mapped_groups)
-    PyArray_ENABLEFLAGS(res, np.NPY_OWNDATA)
-    g_max = max_g[0]
-    free(max_g)
-    return res, g_max
-
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.cdivision(True)

--- a/alphamind/portfolio/rankbuilder.py
+++ b/alphamind/portfolio/rankbuilder.py
@@ -54,19 +54,9 @@ def rank_build(er: np.ndarray, use_rank: int, groups: np.ndarray=None) -> np.nda


 if __name__ == '__main__':
-    # n_samples = 4000
-    # n_include = 100
-    # n_groups = 20
-    #
-    # x = np.random.randn(n_samples, 2)
-    # groups = np.random.randint(n_groups, size=n_samples)
-    #
-    # for i in range(10000):
-    #     rank_build(x, n_include, groups)
+    n_sample = 6
+    n_groups = 3

-    from alphamind.portfolio.impl import set_value
-
-    x = np.zeros((3, 2), dtype=np.bool)
-    index = np.array([[1, 0], [2, 1]])
-    set_value(x.view(dtype=np.uint8), index)
-    print(x)
\ No newline at end of file
+    groups = np.array([1, 1, 2, 1, 0, 2])
+    print(groups)
+    print(groupby(groups))
\ No newline at end of file