Commit a5260eba authored by Dr.李

added model training example

parent db8c05c3
@@ -14,20 +14,21 @@ from alphamind.data.neutralize import neutralize
 def factor_processing(raw_factors: np.ndarray,
                       pre_process: Optional[List]=None,
                       risk_factors: Optional[np.ndarray]=None,
-                      post_process: Optional[List]=None) -> np.ndarray:
+                      post_process: Optional[List]=None,
+                      groups=None) -> np.ndarray:
     new_factors = raw_factors
     if pre_process:
         for p in pre_process:
-            new_factors = p(new_factors)
+            new_factors = p(new_factors, groups=groups)
     if risk_factors is not None:
         risk_factors = risk_factors[:, risk_factors.sum(axis=0) != 0]
-        new_factors = neutralize(risk_factors, new_factors)
+        new_factors = neutralize(risk_factors, new_factors, groups=groups)
     if post_process:
         for p in post_process:
-            new_factors = p(new_factors)
+            new_factors = p(new_factors, groups=groups)
     return new_factors
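A minimal usage sketch for the new groups argument (not part of the commit, and only my reading of the change): the labels are forwarded to every processing step and to neutralize, presumably so that winsorization, standardization and neutralization run within each group (for example, each trading date) rather than over the pooled sample. The names used below come from the example script's `from alphamind.api import *`.

import numpy as np
from alphamind.api import *  # exposes factor_processing, winsorize_normal, standardize

# toy data: 6 stocks over 2 dates, 3 alpha factors, 2 risk exposures (random, illustrative only)
raw_factors = np.random.randn(6, 3)
risk_exposures = np.random.randn(6, 2)
date_labels = np.array([20170801] * 3 + [20170808] * 3)

processed = factor_processing(raw_factors,
                              pre_process=[winsorize_normal, standardize],
                              risk_factors=risk_exposures,
                              post_process=[standardize],
                              groups=date_labels)  # each step applied per date group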
# -*- coding: utf-8 -*-
"""
Created on 2017-8-24
@author: cheng.li
"""
import numpy as np
import pandas as pd
import copy
from sklearn.linear_model import LinearRegression
from alphamind.api import *
from matplotlib import pyplot as plt
plt.style.use('ggplot')
'''
Settings:
universe - zz500
neutralize - 'SIZE' + all industries
benchmark - zz500
base factors - ['CFinc1', 'CHV', 'VAL', 'BDTO', 'RVOL']
quantiles - 5
start_date - 2012-01-01
end_date - 2017-08-01
re-balance - 1 week
training - every 4 weeks
'''
engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
universe = Universe('zz500', ['zz500'])
neutralize_risk = ['SIZE'] + industry_styles
alpha_factors = ['CFinc1', 'CHV', 'VAL', 'BDTO', 'RVOL']
benchmark = 905
n_bins = 5
frequency = '1w'
batch = 4
start_date = '2012-01-01'
end_date = '2017-08-01'
'''
fetch data from the target database
'''
train_y, train_x = prepare_data(engine,
start_date=start_date,
end_date=end_date,
factors=alpha_factors + neutralize_risk,
frequency=frequency,
universe=universe,
benchmark=benchmark)
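# prepare_data returns two frames aligned on (Date, Code): train_y carries the forward return 'dx',
# train_x carries the benchmark 'weight' plus the raw alpha-factor and risk-exposure columns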
dates = train_x.Date.unique()
groups = train_x.Date.values
raw_x = train_x[alpha_factors].values.astype(float)
raw_y = train_y[['dx']].values.astype(float)
benchmark_w = train_x['weight'].values
risk_exp = train_x[neutralize_risk].values.astype(float)
'''
pre-processing stage: winsorization, standardization and neutralization
'''
ne_x = raw_x.copy()
ne_y = raw_y.copy()
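# roll a window of `batch` re-balance periods over the sample; within each window, factors and returns
# are winsorized, standardized, neutralized against the window's risk exposures and re-standardized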
for i, start_date in enumerate(dates[:-batch]):
    end_date = dates[i + batch]
    index = (groups >= start_date) & (groups < end_date)
    this_raw_x = raw_x[index]
    this_raw_y = raw_y[index]
    this_risk_exp = risk_exp[index]
    ne_x[index] = factor_processing(this_raw_x,
                                    pre_process=[winsorize_normal, standardize],
                                    risk_factors=this_risk_exp,
                                    post_process=[standardize])
    ne_y[index] = factor_processing(this_raw_y,
                                    pre_process=[winsorize_normal, standardize],
                                    risk_factors=this_risk_exp,
                                    post_process=[standardize])
'''
training phase: linear regression from scikit-learn, fitted on each rolling window
'''
model = LinearRegression(fit_intercept=False)
model_df = pd.Series()
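# fit one linear model per rolling window and store it keyed by the window's end date,
# which is the date it is later used to predict on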
for i, start_date in enumerate(dates[:-batch]):
    end_date = dates[i + batch]
    index = (groups >= start_date) & (groups < end_date)
    this_ne_x = ne_x[index]
    this_ne_y = ne_y[index]
    model.fit(this_ne_x, this_ne_y)
    model_df.loc[end_date] = copy.deepcopy(model)
    print('Date: {0} training finished'.format(end_date))
'''
prediction phase: apply the trained models on the re-balance dates
'''
final_res = np.zeros((len(dates) - batch, n_bins))
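# on each re-balance date, score the cross-section with the model trained on the preceding window
# and run a benchmark-weighted quantile analysis of the realized returns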
for i, predict_date in enumerate(dates[batch:]):
    model = model_df[predict_date]
    index = groups == predict_date
    this_ne_x = ne_x[index]
    realized_r = raw_y[index]
    this_benchmark_w = benchmark_w[index]
    predict_y = model.predict(this_ne_x)
    res = er_quantile_analysis(predict_y,
                               n_bins,
                               dx_return=realized_r,
                               benchmark=this_benchmark_w)
    final_res[i] = res / this_benchmark_w.sum()
df = pd.DataFrame(final_res, index=dates[batch:])
df.loc[dates[0]] = 0.
df.sort_index(inplace=True)
df.cumsum().plot()
plt.title('Prod factors model training with Linear Regression from 2012 - 2017')
plt.show()
@@ -34,6 +34,7 @@ def prepare_data(engine: SqlEngine,
                  end_date: str,
                  frequency: str,
                  universe: Universe,
+                 benchmark: int,
                  default_window: int=0):
     dates = makeSchedule(start_date, end_date, frequency, calendar='china.sse', dateRule=BizDayConventions.Following)
@@ -46,10 +47,13 @@ def prepare_data(engine: SqlEngine,
                                           dates=dates,
                                           default_window=default_window).sort_values(['Date', 'Code'])
     return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
+    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
     df = pd.merge(factor_df, return_df, on=['Date', 'Code']).dropna()
+    df = pd.merge(df, benchmark_df, on=['Date', 'Code'], how='left')
+    df['weight'] = df['weight'].fillna(0.)
-    return df[['Date', 'Code', 'dx']], df[['Date', 'Code'] + transformer.names]
+    return df[['Date', 'Code', 'dx']], df[['Date', 'Code', 'weight'] + transformer.names]

 if __name__ == '__main__':
     ...