Commit 8c31e011 authored by Dr.李

simplified model training example

parent a5260eba
@@ -22,7 +22,7 @@ from alphamind.data.standardize import standardize
 from alphamind.data.neutralize import neutralize
 from alphamind.data.engines.sqlengine import factor_tables
-from alphamind.model.preparing import prepare_data
+from alphamind.model.data_preparing import fetch_data_package
 from alphamind.utilities import alpha_logger
@@ -43,6 +43,6 @@ __all__ = [
     'standardize',
     'neutralize',
     'factor_tables',
-    'prepare_data',
+    'fetch_data_package',
     'alpha_logger'
 ]
\ No newline at end of file
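With this rename, downstream scripts import fetch_data_package from the package API instead of prepare_data. A minimal sketch of the updated call, assuming SqlEngine and Universe are also exported by alphamind.api as in the example script below; the connection string and factor list are placeholders, not from this commit:

from alphamind.api import *

engine = SqlEngine('postgresql+psycopg2://user:password@host/alpha')
universe = Universe('zz500', ['zz500'])
data_package = fetch_data_package(engine,
                                  alpha_factors=['EPS'],  # hypothetical single-factor list
                                  start_date='2012-01-01',
                                  end_date='2012-04-01',
                                  frequency='1m',
                                  universe=universe,
                                  benchmark=905)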
@@ -224,7 +224,7 @@ class SqlEngine(object):
                      ref_date: str,
                      factors: Iterable[object],
                      codes: Iterable[int],
-                     default_window: int=0) -> pd.DataFrame:
+                     warm_start: int=0) -> pd.DataFrame:
         if isinstance(factors, Transformer):
             transformer = factors
@@ -235,7 +235,7 @@ class SqlEngine(object):
         factor_cols = _map_factors(dependency)
 
-        start_date = advanceDateByCalendar('china.sse', ref_date, str(-default_window) + 'b').strftime('%Y-%m-%d')
+        start_date = advanceDateByCalendar('china.sse', ref_date, str(-warm_start) + 'b').strftime('%Y-%m-%d')
         end_date = ref_date
 
         big_table = Market
@@ -263,7 +263,7 @@ class SqlEngine(object):
                            start_date: str = None,
                            end_date: str = None,
                            dates: Iterable[str] = None,
-                           default_window: int=0) -> pd.DataFrame:
+                           warm_start: int=0) -> pd.DataFrame:
         if isinstance(factors, Transformer):
             transformer = factors
@@ -287,10 +287,10 @@ class SqlEngine(object):
             real_dates = dates
         else:
             if dates:
-                real_start_date = advanceDateByCalendar('china.sse', dates[0], str(-default_window) + 'b').strftime('%Y-%m-%d')
+                real_start_date = advanceDateByCalendar('china.sse', dates[0], str(-warm_start) + 'b').strftime('%Y-%m-%d')
                 real_end_date = dates[-1]
             else:
-                real_start_date = advanceDateByCalendar('china.sse', start_date, str(-default_window) + 'b').strftime('%Y-%m-%d')
+                real_start_date = advanceDateByCalendar('china.sse', start_date, str(-warm_start) + 'b').strftime('%Y-%m-%d')
                 real_end_date = end_date
             real_dates = None
@@ -394,6 +394,9 @@ class SqlEngine(object):
         risk_cov = pd.read_sql(query, self.engine).sort_values(['Date', 'FactorID'])
 
+        if not excluded:
+            excluded = []
+
         risk_exposure_cols = [RiskExposure.__table__.columns[f] for f in total_risk_factors if f not in set(excluded)]
         big_table = outerjoin(special_risk_table, RiskExposure,
                               and_(special_risk_table.Date == RiskExposure.Date,
......
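The default_window → warm_start rename above makes the intent explicit: the engine pads the query window backwards by warm_start business days so that rolling factor expressions have enough history to "warm up" on the first requested date. A small sketch of the underlying PyFin call used in the changed lines (the dates are illustrative):

from PyFin.api import advanceDateByCalendar

ref_date = '2017-08-01'
warm_start = 10
# step back 10 Shanghai Stock Exchange business days from the reference date
start_date = advanceDateByCalendar('china.sse', ref_date, str(-warm_start) + 'b').strftime('%Y-%m-%d')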
@@ -10,7 +10,9 @@ import pandas as pd
 import copy
 from sklearn.linear_model import LinearRegression
 from alphamind.api import *
+from PyFin.api import *
+
 from matplotlib import pyplot as plt
 plt.style.use('ggplot')
 
 '''
@@ -25,6 +27,7 @@ Settings:
     end_date - 2017-08-01
     re-balance - 1 week
     training - every 4 week
 '''
 
 engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
@@ -39,91 +42,70 @@ start_date = '2012-01-01'
 end_date = '2017-08-01'
 
 '''
-fetch data from target data base
+fetch data from target data base and do the corresponding data processing
 '''
-train_y, train_x = prepare_data(engine,
-                                start_date=start_date,
-                                end_date=end_date,
-                                factors=alpha_factors + neutralize_risk,
-                                frequency=frequency,
-                                universe=universe,
-                                benchmark=benchmark)
-
-dates = train_x.Date.unique()
-groups = train_x.Date.values
-
-raw_x = train_x[alpha_factors].values.astype(float)
-raw_y = train_y[['dx']].values.astype(float)
-benchmark_w = train_x['weight'].values
-risk_exp = train_x[neutralize_risk].values.astype(float)
-
-'''
-pre-processing stage for winsorize, standardize and neutralize
-'''
-ne_x = raw_x.copy()
-ne_y = raw_y.copy()
-
-for i, start_date in enumerate(dates[:-batch]):
-    end_date = dates[i + batch]
-    index = (groups >= start_date) & (groups < end_date)
-    this_raw_x = raw_x[index]
-    this_raw_y = raw_y[index]
-    this_risk_exp = risk_exp[index]
-
-    ne_x[index] = factor_processing(this_raw_x,
-                                    pre_process=[winsorize_normal, standardize],
-                                    risk_factors=this_risk_exp,
-                                    post_process=[standardize])
-    ne_y[index] = factor_processing(this_raw_y,
-                                    pre_process=[winsorize_normal, standardize],
-                                    risk_factors=this_risk_exp,
-                                    post_process=[standardize])
+data_package = fetch_data_package(engine,
+                                  alpha_factors=alpha_factors,
+                                  start_date=start_date,
+                                  end_date=end_date,
+                                  frequency=frequency,
+                                  universe=universe,
+                                  benchmark=benchmark,
+                                  batch=batch,
+                                  neutralized_risk=neutralize_risk,
+                                  pre_process=[winsorize_normal, standardize],
+                                  post_process=[standardize])
 
 '''
 training phase: using Linear - regression from scikit-learn
 '''
+train_x = data_package['train']['x']
+train_y = data_package['train']['y']
+
+dates = sorted(train_x.keys())
+
 model = LinearRegression(fit_intercept=False)
 model_df = pd.Series()
 
-for i, start_date in enumerate(dates[:-batch]):
-    end_date = dates[i + batch]
-    index = (groups >= start_date) & (groups < end_date)
-    this_ne_x = ne_x[index]
-    this_ne_y = ne_y[index]
-    model.fit(this_ne_x, this_ne_y)
-    model_df.loc[end_date] = copy.deepcopy(model)
-    print('Date: {0} training finished'.format(end_date))
+for train_date in dates:
+    x = train_x[train_date]
+    y = train_y[train_date]
+    model.fit(x, y)
+    model_df.loc[train_date] = copy.deepcopy(model)
+    print('Date: {0} training finished'.format(train_date))
 
 '''
 predicting phase: using trained model on the re-balance dates
 '''
-final_res = np.zeros((len(dates) - batch, n_bins))
+predict_x = data_package['predict']['x']
+settlement = data_package['settlement']
+
+final_res = np.zeros((len(dates), n_bins))
 
-for i, predict_date in enumerate(dates[batch:]):
+for i, predict_date in enumerate(dates):
     model = model_df[predict_date]
-    index = groups == predict_date
-    this_ne_x = ne_x[index]
-    realized_r = raw_y[index]
-    this_benchmark_w = benchmark_w[index]
+    x = predict_x[predict_date]
+    benchmark_w = settlement[settlement.Date == predict_date]['weight'].values
+    realized_r = settlement[settlement.Date == predict_date]['dx'].values
 
-    predict_y = model.predict(this_ne_x)
+    predict_y = model.predict(x)
     res = er_quantile_analysis(predict_y,
                                n_bins,
                                dx_return=realized_r,
-                               benchmark=this_benchmark_w)
-    final_res[i] = res / this_benchmark_w.sum()
+                               benchmark=benchmark_w)
+    final_res[i] = res / benchmark_w.sum()
+    print('Date: {0} predicting finished'.format(predict_date))
+
+last_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
 
-df = pd.DataFrame(final_res, index=dates[batch:])
+df = pd.DataFrame(final_res, index=dates[1:] + [last_date])
+df.loc[dates[0]] = 0.
 df.sort_index(inplace=True)
 df = df.cumsum().plot()
 plt.title('Prod factors model training with Linear Regression from 2012 - 2017')
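For reference, the dictionary consumed above is assembled at the end of fetch_data_package in the new alphamind/model/data_preparing.py (listed below). Its layout is:

data_package = {
    'settlement': settlement_df,            # DataFrame with Date, Code, dx and benchmark weight
    'train': {'x': train_x, 'y': train_y},  # dicts keyed by training date -> processed arrays
    'predict': {'x': predict_x}             # dict keyed by re-balance date -> feature arrays
}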
......
alphamind/model/data_preparing.py (new file):

# -*- coding: utf-8 -*-
"""
Created on 2017-8-24

@author: cheng.li
"""
import numpy as np
import pandas as pd
from typing import Iterable
from typing import Union
from PyFin.api import makeSchedule
from PyFin.api import BizDayConventions
from alphamind.data.transformer import Transformer
from alphamind.data.engines.sqlengine import SqlEngine
from alphamind.data.engines.universe import Universe
from alphamind.data.processing import factor_processing
def _map_horizon(frequency: str) -> int:
    if frequency == '1d':
        return 0
    elif frequency == '1w':
        return 4
    elif frequency == '1m':
        return 21
    elif frequency == '3m':
        return 62
    else:
        raise ValueError('{0} is an unrecognized frequency rule'.format(frequency))
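# Note: the horizon above is presumably measured in extra trading days on top of
# a one-day forward return, e.g. '1w' -> 4 extra days for a 5-day trading week.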
def prepare_data(engine: SqlEngine,
                 factors: Union[Transformer, Iterable[object]],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 warm_start: int = 0):
    dates = makeSchedule(start_date, end_date, frequency, calendar='china.sse', dateRule=BizDayConventions.Following)
    horizon = _map_horizon(frequency)

    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates,
                                          warm_start=warm_start).sort_values(['Date', 'Code'])
    return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)

    df = pd.merge(factor_df, return_df, on=['Date', 'Code']).dropna()
    df = pd.merge(df, benchmark_df, on=['Date', 'Code'], how='left')
    df['weight'] = df['weight'].fillna(0.)

    # target frame (Date, Code, dx) and feature frame (Date, Code, benchmark weight, factors)
    return df[['Date', 'Code', 'dx']], df[['Date', 'Code', 'weight'] + transformer.names]
def batch_processing(x_values,
                     y_values,
                     groups,
                     group_label,
                     batch,
                     risk_exp,
                     pre_process,
                     post_process):
    train_x_buckets = {}
    train_y_buckets = {}
    predict_x_buckets = {}

    for i, start in enumerate(groups[:-batch]):
        end = groups[i + batch]
        index = (group_label >= start) & (group_label < end)
        this_raw_x = x_values[index]
        this_raw_y = y_values[index]
        if risk_exp is not None:
            this_risk_exp = risk_exp[index]
        else:
            this_risk_exp = None

        train_x_buckets[end] = factor_processing(this_raw_x,
                                                 pre_process=pre_process,
                                                 risk_factors=this_risk_exp,
                                                 post_process=post_process)
        train_y_buckets[end] = factor_processing(this_raw_y,
                                                 pre_process=pre_process,
                                                 risk_factors=this_risk_exp,
                                                 post_process=post_process)

        index = (group_label > start) & (group_label <= end)
        sub_dates = group_label[index]
        this_raw_x = x_values[index]
        if risk_exp is not None:
            this_risk_exp = risk_exp[index]
        else:
            this_risk_exp = None

        ne_x = factor_processing(this_raw_x,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)
        predict_x_buckets[end] = ne_x[sub_dates == end]

    return train_x_buckets, train_y_buckets, predict_x_buckets
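# Illustration (hypothetical dates): with groups = [d1, d2, d3, d4] and batch = 2,
# the loop produces buckets keyed by d3 and d4. train_x_buckets[d3] is built from
# all rows with d1 <= Date < d3, while predict_x_buckets[d3] keeps only the rows
# dated exactly d3; the predict rows are normalized together with the trailing
# window so that train and predict features receive the same pre-/post-processing.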
def fetch_data_package(engine: SqlEngine,
                       alpha_factors: Iterable[object],
                       start_date: str,
                       end_date: str,
                       frequency: str,
                       universe: Universe,
                       benchmark: int,
                       warm_start: int = 0,
                       batch: int = 1,
                       neutralized_risk: Iterable[str] = None,
                       risk_model: str = 'short',
                       pre_process: Iterable[object] = None,
                       post_process: Iterable[object] = None):
    transformer = Transformer(alpha_factors)
    dates = makeSchedule(start_date, end_date, frequency, calendar='china.sse', dateRule=BizDayConventions.Following)

    return_df, factor_df = prepare_data(engine,
                                        transformer,
                                        start_date,
                                        end_date,
                                        frequency,
                                        universe,
                                        benchmark,
                                        warm_start)

    if neutralized_risk:
        risk_df = engine.fetch_risk_model_range(universe, dates=dates, risk_model=risk_model)[1]
        risk_df = risk_df[['Date', 'Code'] + neutralized_risk].dropna()
        train_x = pd.merge(factor_df, risk_df, on=['Date', 'Code'])
        return_df = pd.merge(return_df, risk_df, on=['Date', 'Code'])[['Date', 'Code', 'dx']]
        train_y = return_df.copy()
        risk_exp = train_x[neutralized_risk].values.astype(float)
        x_values = train_x[transformer.names].values.astype(float)
        y_values = train_y[['dx']].values
    else:
        risk_exp = None
        train_x = factor_df.copy()
        train_y = return_df.copy()
        x_values = train_x[transformer.names].values.astype(float)
        y_values = train_y[['dx']].values

    date_label = pd.DatetimeIndex(factor_df.Date).to_pydatetime()
    dates = np.unique(date_label)

    return_df['weight'] = train_x['weight']
    train_x_buckets, train_y_buckets, predict_x_buckets = batch_processing(x_values,
                                                                           y_values,
                                                                           dates,
                                                                           date_label,
                                                                           batch,
                                                                           risk_exp,
                                                                           pre_process,
                                                                           post_process)

    ret = dict()
    ret['settlement'] = return_df
    ret['train'] = {'x': train_x_buckets, 'y': train_y_buckets}
    ret['predict'] = {'x': predict_x_buckets}
    return ret
if __name__ == '__main__':
    from PyFin.api import MA

    engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
    universe = Universe('zz500', ['zz500'])
    res = fetch_data_package(engine,
                             MA(10, 'EPS'),
                             '2012-01-01',
                             '2012-04-01',
                             '1m',
                             universe,
                             905,
                             0)
    print(res)
alphamind/model/preparing.py (deleted):

# -*- coding: utf-8 -*-
"""
Created on 2017-8-24

@author: cheng.li
"""
import pandas as pd
from typing import Iterable
from typing import Union
from PyFin.api import makeSchedule
from PyFin.api import BizDayConventions
from alphamind.data.transformer import Transformer
from alphamind.data.engines.sqlengine import SqlEngine
from alphamind.data.engines.universe import Universe
def _map_horizon(frequency: str) -> int:
    if frequency == '1d':
        return 0
    elif frequency == '1w':
        return 4
    elif frequency == '1m':
        return 21
    elif frequency == '3m':
        return 62
    else:
        raise ValueError('{0} is an unrecognized frequency rule'.format(frequency))
def prepare_data(engine: SqlEngine,
                 factors: Iterable[object],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 default_window: int=0):
    dates = makeSchedule(start_date, end_date, frequency, calendar='china.sse', dateRule=BizDayConventions.Following)
    horizon = _map_horizon(frequency)

    transformer = Transformer(factors)
    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates,
                                          default_window=default_window).sort_values(['Date', 'Code'])
    return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)

    df = pd.merge(factor_df, return_df, on=['Date', 'Code']).dropna()
    df = pd.merge(df, benchmark_df, on=['Date', 'Code'], how='left')
    df['weight'] = df['weight'].fillna(0.)

    return df[['Date', 'Code', 'dx']], df[['Date', 'Code', 'weight'] + transformer.names]
if __name__ == '__main__':
    from PyFin.api import *

    engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
    universe = Universe('zz500', ['zz500'])
    df1, df2 = prepare_data(engine,
                            MA(10, 'EPS'),
                            '2012-01-01',
                            '2013-01-01',
                            '1w',
                            universe,
                            905)
    print(df1)
    print(df2)