update standardize to avoid overflow

19bb12be · Dr.李 · a4d062a0 · 19bb12be · 19bb12be · bf436718
Commit 19bb12be authored Jan 16, 2018 by Dr.李
Hide whitespace changes
Inline Side-by-side

Showing with 97 additions and 5 deletions

standardize.py alphamind/data/standardize.py +4 -4

factor_res_analysis.py alphamind/examples/factor_res_analysis.py +92 -0

xgboost xgboost +1 -1

No files found.
--- a/alphamind/data/standardize.py
+++ b/alphamind/data/standardize.py
@@ -22,9 +22,9 @@ def standardize(x: np.ndarray, groups: np.ndarray=None, ddof=1) -> np.ndarray:
        mean_values = transform(groups, x, 'mean')
        std_values = transform(groups, x, 'std', ddof)

-        return (x - mean_values) / std_values
+        return (x - mean_values) / np.maximum(std_values, 1e-8)
    else:
-        return (x - simple_mean(x, axis=0)) / simple_std(x, axis=0, ddof=ddof)
+        return (x - simple_mean(x, axis=0)) / np.maximum(simple_std(x, axis=0, ddof=ddof), 1e-8)


 def projection(x: np.ndarray, groups: np.ndarray=None, axis=1) -> np.ndarray:
@@ -48,7 +48,7 @@ class Standardizer(object):
        self.std_ = simple_std(x, axis=0, ddof=self.ddof_)

    def transform(self, x: np.ndarray) -> np.ndarray:
-        return (x - self.mean_) / self.std_
+        return (x - self.mean_) / np.maximum(self.std_, 1e-8)


 class GroupedStandardizer(object):
@@ -69,4 +69,4 @@ class GroupedStandardizer(object):
    def transform(self, x: np.ndarray) -> np.ndarray:
        groups = x[:, 0].astype(int)
        index = array_index(self.labels_, groups)
-        return (x[:, 1:] - self.mean_[index]) / self.std_[index]
+        return (x[:, 1:] - self.mean_[index]) / np.maximum(self.std_[index], 1e-8)
--- a/alphamind/examples/factor_res_analysis.py
+++ b/alphamind/examples/factor_res_analysis.py
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-1-15
+
+@author: cheng.li
+"""
+
+import numpy as np
+import pandas as pd
+from PyFin.api import *
+from alphamind.api import *
+
+
+def factor_residue_analysis(start_date,  # runs er_quantile_analysis per trade_date on the '<factor>_res' column; returns a DataFrame
+                            end_date,
+                            factor,
+                            freq,
+                            universe,
+                            engine):
+    neutralize_risk = ['SIZE', 'LEVERAGE'] + industry_styles  # columns later passed as risk_factors to factor_processing
+    n_bins = 5  # bin count given to er_quantile_analysis; also the column count of final_res
+    horizon = map_freq(freq)  # horizon argument for fetch_dx_return_range, derived from freq
+
+    dates = makeSchedule(start_date,
+                         end_date,
+                         tenor=freq,
+                         calendar='china.sse')
+
+    alpha_factor_name = factor + '_res'  # column name under which the derived factor is fetched
+    base1 = LAST('roe_q')
+    base2 = CSRes(LAST('ep_q'), 'roe_q')
+    alpha_factor = {alpha_factor_name: CSRes(CSRes(LAST(factor), base1), base2)}  # CSRes applied against base1, then base2 (presumably cross-sectional residuals -- confirm)
+    factor_all_data = engine.fetch_data_range(universe,
+                                              alpha_factor,
+                                              dates=dates)['factor']
+    return_all_data = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
+
+    factor_groups = factor_all_data.groupby('trade_date')
+    return_groups = return_all_data.groupby('trade_date')
+    final_res = np.zeros((len(factor_groups.groups), n_bins))  # one row per trade_date group, one column per bin
+
+    index_dates = []
+
+    for i, value in enumerate(factor_groups):  # value is a (trade_date, sub-frame) pair
+        date = value[0]
+        data = value[1][['code', alpha_factor_name, 'isOpen'] + neutralize_risk]
+        returns = return_groups.get_group(date)
+
+        total_data = pd.merge(data, returns, on=['code']).dropna()  # inner join on code (pd.merge default), then drop rows with any NaN
+        risk_exp = total_data[neutralize_risk].values.astype(float)
+        dx_return = total_data.dx.values
+
+        index_dates.append(date)  # appended before the try, so a failing date still gets an index entry
+        try:
+            er = factor_processing(total_data[[alpha_factor_name]].values,
+                                   pre_process=[winsorize_normal, standardize],
+                                   risk_factors=risk_exp,
+                                   post_process=[winsorize_normal, standardize])
+            res = er_quantile_analysis(er,
+                                       n_bins=n_bins,
+                                       dx_return=dx_return)
+        except Exception as e:  # best-effort: print the error and record zeros for this date
+            print(e)
+            res = np.zeros(n_bins)
+
+        final_res[i] = res
+
+    df = pd.DataFrame(final_res, index=index_dates)
+
+    start_date = advanceDateByCalendar('china.sse', dates[0], '-1d')  # rebinds start_date to dates[0] shifted by '-1d' on the china.sse calendar
+    df.loc[start_date] = 0.  # adds an all-zero row at that date
+    df.sort_index(inplace=True)
+    df['$top1 - top5$'] = df[0] - df[4]  # first-bin column minus last-bin column (n_bins is 5)
+    return df
+
+
+engine = SqlEngine()
+df = engine.fetch_factor_coverage().groupby('factor').mean()  # per-factor mean of the coverage table
+df = df[df.coverage >= 0.98]  # keep factors whose mean coverage is at least 0.98
+universe = Universe('custom', ['zz800'])
+
+factor_df = pd.DataFrame()  # collects one '$top1 - top5$' column per factor
+
+for i, factor in enumerate(df.index):  # df is indexed by factor after the groupby above
+    res = factor_residue_analysis('2012-01-01',
+                                  '2018-01-05',
+                                  factor,
+                                  '5b',
+                                  universe,
+                                  engine)
+    factor_df[factor] = res['$top1 - top5$']
+    alpha_logger.info('{0}: {1} is done'.format(i + 1, factor))
--- a/xgboost @ bf436718
+++ b/xgboost @ bf436718
-Subproject commit a187ed6c8f3aa40b47d5be80667cbbe6a6fd563d
+Subproject commit bf4367184164e593cd2856ef38f8dd4f8cc76999