Commit 8c31e011 authored by Dr.李

simplified model training example

parent a5260eba
@@ -22,7 +22,7 @@ from alphamind.data.standardize import standardize
 from alphamind.data.neutralize import neutralize
 from alphamind.data.engines.sqlengine import factor_tables
-from alphamind.model.preparing import prepare_data
+from alphamind.model.data_preparing import fetch_data_package
 from alphamind.utilities import alpha_logger
@@ -43,6 +43,6 @@ __all__ = [
     'standardize',
     'neutralize',
     'factor_tables',
-    'prepare_data',
+    'fetch_data_package',
     'alpha_logger'
 ]
\ No newline at end of file
@@ -224,7 +224,7 @@ class SqlEngine(object):
                      ref_date: str,
                      factors: Iterable[object],
                      codes: Iterable[int],
-                     default_window: int=0) -> pd.DataFrame:
+                     warm_start: int=0) -> pd.DataFrame:
         if isinstance(factors, Transformer):
             transformer = factors
@@ -235,7 +235,7 @@ class SqlEngine(object):
         factor_cols = _map_factors(dependency)
-        start_date = advanceDateByCalendar('china.sse', ref_date, str(-default_window) + 'b').strftime('%Y-%m-%d')
+        start_date = advanceDateByCalendar('china.sse', ref_date, str(-warm_start) + 'b').strftime('%Y-%m-%d')
         end_date = ref_date
         big_table = Market
@@ -263,7 +263,7 @@ class SqlEngine(object):
                            start_date: str = None,
                            end_date: str = None,
                            dates: Iterable[str] = None,
-                           default_window: int=0) -> pd.DataFrame:
+                           warm_start: int=0) -> pd.DataFrame:
         if isinstance(factors, Transformer):
             transformer = factors
@@ -287,10 +287,10 @@ class SqlEngine(object):
             real_dates = dates
         else:
             if dates:
-                real_start_date = advanceDateByCalendar('china.sse', dates[0], str(-default_window) + 'b').strftime('%Y-%m-%d')
+                real_start_date = advanceDateByCalendar('china.sse', dates[0], str(-warm_start) + 'b').strftime('%Y-%m-%d')
                 real_end_date = dates[-1]
             else:
-                real_start_date = advanceDateByCalendar('china.sse', start_date, str(-default_window) + 'b').strftime('%Y-%m-%d')
+                real_start_date = advanceDateByCalendar('china.sse', start_date, str(-warm_start) + 'b').strftime('%Y-%m-%d')
                 real_end_date = end_date
                 real_dates = None
@@ -394,6 +394,9 @@ class SqlEngine(object):
         risk_cov = pd.read_sql(query, self.engine).sort_values(['Date', 'FactorID'])
+        if not excluded:
+            excluded = []
+        risk_exposure_cols = [RiskExposure.__table__.columns[f] for f in total_risk_factors if f not in set(excluded)]
         big_table = outerjoin(special_risk_table, RiskExposure,
                               and_(special_risk_table.Date == RiskExposure.Date,
......
@@ -10,7 +10,9 @@ import pandas as pd
 import copy
 from sklearn.linear_model import LinearRegression
 from alphamind.api import *
+from PyFin.api import *
 from matplotlib import pyplot as plt
+plt.style.use('ggplot')
 '''
@@ -25,6 +27,7 @@ Settings:
 end_date   - 2017-08-01
 re-balance - 1 week
 training   - every 4 weeks
 '''
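# Editorial sketch (not part of the commit): the fold below hides the settings
# block that defines the names used in this script. Hypothetical values only --
# universe/benchmark mirror the __main__ demos later in this commit, frequency
# and batch follow the docstring above, and the factor/risk lists and n_bins
# are invented placeholders:
#   alpha_factors = ['EPS', 'ROE']           # placeholder factor list
#   neutralize_risk = ['SIZE']               # placeholder risk-style list
#   universe = Universe('zz500', ['zz500'])
#   benchmark = 905                          # zz500 index code
#   frequency = '1w'                         # re-balance - 1 week
#   batch = 4                                # training - every 4 weeks
#   n_bins = 5                               # quantile count; assumed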
 engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
@@ -39,91 +42,70 @@ start_date = '2012-01-01'
 end_date = '2017-08-01'
 '''
-fetch data from target data base
+fetch data from target database and do the corresponding data processing
 '''
-train_y, train_x = prepare_data(engine,
-                                start_date=start_date,
-                                end_date=end_date,
-                                factors=alpha_factors + neutralize_risk,
-                                frequency=frequency,
-                                universe=universe,
-                                benchmark=benchmark)
-dates = train_x.Date.unique()
-groups = train_x.Date.values
-raw_x = train_x[alpha_factors].values.astype(float)
-raw_y = train_y[['dx']].values.astype(float)
-benchmark_w = train_x['weight'].values
-risk_exp = train_x[neutralize_risk].values.astype(float)
+data_package = fetch_data_package(engine,
+                                  alpha_factors=alpha_factors,
+                                  start_date=start_date,
+                                  end_date=end_date,
+                                  frequency=frequency,
+                                  universe=universe,
+                                  benchmark=benchmark,
+                                  batch=batch,
+                                  neutralized_risk=neutralize_risk,
+                                  pre_process=[winsorize_normal, standardize],
+                                  post_process=[standardize])
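# Editorial note (not part of the commit): the returned package is a plain dict
# whose layout, as assembled by fetch_data_package further down this commit, is:
#   data_package['train']['x']   -- dict: window-end date -> processed factor matrix
#   data_package['train']['y']   -- dict: window-end date -> processed forward returns
#   data_package['predict']['x'] -- dict: date -> factor matrix for that date only
#   data_package['settlement']   -- DataFrame with Date, Code, dx and benchmark weight
# e.g. inspecting one training bucket:
#   first_date = sorted(data_package['train']['x'].keys())[0]
#   data_package['train']['x'][first_date].shape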
 '''
-pre-processing stage for winsorize, standardize and neutralize
+training phase: using LinearRegression from scikit-learn
 '''
-ne_x = raw_x.copy()
-ne_y = raw_y.copy()
-for i, start_date in enumerate(dates[:-batch]):
-    end_date = dates[i + batch]
-    index = (groups >= start_date) & (groups < end_date)
-    this_raw_x = raw_x[index]
-    this_raw_y = raw_y[index]
-    this_risk_exp = risk_exp[index]
-    ne_x[index] = factor_processing(this_raw_x,
-                                    pre_process=[winsorize_normal, standardize],
-                                    risk_factors=this_risk_exp,
-                                    post_process=[standardize])
-    ne_y[index] = factor_processing(this_raw_y,
-                                    pre_process=[winsorize_normal, standardize],
-                                    risk_factors=this_risk_exp,
-                                    post_process=[standardize])
-'''
-training phase: using Linear - regression from scikit-learn
-'''
+train_x = data_package['train']['x']
+train_y = data_package['train']['y']
+dates = sorted(train_x.keys())
 model = LinearRegression(fit_intercept=False)
 model_df = pd.Series()
-for i, start_date in enumerate(dates[:-batch]):
-    end_date = dates[i + batch]
-    index = (groups >= start_date) & (groups < end_date)
-    this_ne_x = ne_x[index]
-    this_ne_y = ne_y[index]
-    model.fit(this_ne_x, this_ne_y)
-    model_df.loc[end_date] = copy.deepcopy(model)
-    print('Date: {0} training finished'.format(end_date))
+for train_date in dates:
+    x = train_x[train_date]
+    y = train_y[train_date]
+    model.fit(x, y)
+    model_df.loc[train_date] = copy.deepcopy(model)
+    print('Date: {0} training finished'.format(train_date))
 '''
 predicting phase: using trained model on the re-balance dates
 '''
-final_res = np.zeros((len(dates) - batch, n_bins))
-for i, predict_date in enumerate(dates[batch:]):
+predict_x = data_package['predict']['x']
+settlement = data_package['settlement']
+final_res = np.zeros((len(dates), n_bins))
+for i, predict_date in enumerate(dates):
     model = model_df[predict_date]
-    index = groups == predict_date
-    this_ne_x = ne_x[index]
-    realized_r = raw_y[index]
-    this_benchmark_w = benchmark_w[index]
-    predict_y = model.predict(this_ne_x)
+    x = predict_x[predict_date]
+    benchmark_w = settlement[settlement.Date == predict_date]['weight'].values
+    realized_r = settlement[settlement.Date == predict_date]['dx'].values
+    predict_y = model.predict(x)
     res = er_quantile_analysis(predict_y,
                                n_bins,
                                dx_return=realized_r,
-                               benchmark=this_benchmark_w)
-    final_res[i] = res / this_benchmark_w.sum()
+                               benchmark=benchmark_w)
+    final_res[i] = res / benchmark_w.sum()
+    print('Date: {0} predicting finished'.format(predict_date))
-df = pd.DataFrame(final_res, index=dates[batch:])
+last_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
+df = pd.DataFrame(final_res, index=dates[1:] + [last_date])
+df.loc[dates[0]] = 0.
 df.sort_index(inplace=True)
 df = df.cumsum().plot()
+plt.title('Prod factors model training with Linear Regression from 2012 - 2017')
......
# -*- coding: utf-8 -*-
"""
Created on 2017-8-24

@author: cheng.li
"""

import numpy as np
import pandas as pd
from typing import Iterable
from typing import Union
from PyFin.api import makeSchedule
from PyFin.api import BizDayConventions
from alphamind.data.transformer import Transformer
from alphamind.data.engines.sqlengine import SqlEngine
from alphamind.data.engines.universe import Universe
from alphamind.data.processing import factor_processing

def _map_horizon(frequency: str) -> int:
    if frequency == '1d':
        return 0
    elif frequency == '1w':
        return 4
    elif frequency == '1m':
        return 21
    elif frequency == '3m':
        return 62
    else:
        raise ValueError('{0} is an unrecognized frequency rule'.format(frequency))
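
# Illustration only (not part of the commit): the horizon appears to be the
# number of extra business days the forward return spans beyond the re-balance
# day, inferred from how prepare_data feeds it to fetch_dx_return_range.
assert _map_horizon('1d') == 0   # next-day return
assert _map_horizon('1w') == 4   # one trading week ahead
assert _map_horizon('1m') == 21  # roughly one trading month ahead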

def prepare_data(engine: SqlEngine,
                 factors: Union[Transformer, Iterable[object]],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 warm_start: int = 0):
    dates = makeSchedule(start_date, end_date, frequency, calendar='china.sse', dateRule=BizDayConventions.Following)
    horizon = _map_horizon(frequency)

    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates,
                                          warm_start=warm_start).sort_values(['Date', 'Code'])
    return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)

    df = pd.merge(factor_df, return_df, on=['Date', 'Code']).dropna()
    df = pd.merge(df, benchmark_df, on=['Date', 'Code'], how='left')
    df['weight'] = df['weight'].fillna(0.)

    return df[['Date', 'Code', 'dx']], df[['Date', 'Code', 'weight'] + transformer.names]

def batch_processing(x_values,
                     y_values,
                     groups,
                     group_label,
                     batch,
                     risk_exp,
                     pre_process,
                     post_process):
    train_x_buckets = {}
    train_y_buckets = {}
    predict_x_buckets = {}

    for i, start in enumerate(groups[:-batch]):
        end = groups[i + batch]
        index = (group_label >= start) & (group_label < end)
        this_raw_x = x_values[index]
        this_raw_y = y_values[index]
        if risk_exp is not None:
            this_risk_exp = risk_exp[index]
        else:
            this_risk_exp = None

        train_x_buckets[end] = factor_processing(this_raw_x,
                                                 pre_process=pre_process,
                                                 risk_factors=this_risk_exp,
                                                 post_process=post_process)
        train_y_buckets[end] = factor_processing(this_raw_y,
                                                 pre_process=pre_process,
                                                 risk_factors=this_risk_exp,
                                                 post_process=post_process)

        index = (group_label > start) & (group_label <= end)
        sub_dates = group_label[index]
        this_raw_x = x_values[index]
        if risk_exp is not None:
            this_risk_exp = risk_exp[index]
        else:
            this_risk_exp = None

        ne_x = factor_processing(this_raw_x,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)
        predict_x_buckets[end] = ne_x[sub_dates == end]

    return train_x_buckets, train_y_buckets, predict_x_buckets
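
# For intuition, a tiny self-contained run of the bucketing above (illustration
# only, not part of the commit; _demo_batch_processing is a hypothetical helper
# and relies on factor_processing accepting the same None defaults that
# fetch_data_package passes through):
def _demo_batch_processing():
    labels = np.repeat(np.array([1, 2, 3, 4, 5]), 2)  # two rows per 'date' 1..5
    xs = np.arange(20, dtype=float).reshape(10, 2)    # two factor columns
    ys = np.arange(10, dtype=float).reshape(10, 1)    # one return column
    tx, ty, px = batch_processing(xs, ys,
                                  groups=np.unique(labels),
                                  group_label=labels,
                                  batch=2,
                                  risk_exp=None,
                                  pre_process=None,
                                  post_process=None)
    print(sorted(tx.keys()))  # [3, 4, 5]: training buckets are keyed by window end
    print(tx[3].shape)        # (4, 2): rows dated 1-2, strictly before the end date
    print(px[3].shape)        # (2, 2): prediction rows are exactly those dated 3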

def fetch_data_package(engine: SqlEngine,
                       alpha_factors: Iterable[object],
                       start_date: str,
                       end_date: str,
                       frequency: str,
                       universe: Universe,
                       benchmark: int,
                       warm_start: int = 0,
                       batch: int = 1,
                       neutralized_risk: Iterable[str] = None,
                       risk_model: str = 'short',
                       pre_process: Iterable[object] = None,
                       post_process: Iterable[object] = None):
    transformer = Transformer(alpha_factors)
    dates = makeSchedule(start_date, end_date, frequency, calendar='china.sse', dateRule=BizDayConventions.Following)

    return_df, factor_df = prepare_data(engine,
                                        transformer,
                                        start_date,
                                        end_date,
                                        frequency,
                                        universe,
                                        benchmark,
                                        warm_start)

    if neutralized_risk:
        risk_df = engine.fetch_risk_model_range(universe, dates=dates, risk_model=risk_model)[1]
        risk_df = risk_df[['Date', 'Code'] + neutralized_risk].dropna()
        train_x = pd.merge(factor_df, risk_df, on=['Date', 'Code'])
        return_df = pd.merge(return_df, risk_df, on=['Date', 'Code'])[['Date', 'Code', 'dx']]
        train_y = return_df.copy()
        risk_exp = train_x[neutralized_risk].values.astype(float)
        x_values = train_x[transformer.names].values.astype(float)
        y_values = train_y[['dx']].values
    else:
        risk_exp = None
        train_x = factor_df.copy()
        train_y = return_df.copy()
        x_values = train_x[transformer.names].values.astype(float)
        y_values = train_y[['dx']].values

    date_label = pd.DatetimeIndex(train_x.Date).to_pydatetime()
    dates = np.unique(date_label)

    return_df['weight'] = train_x['weight']

    train_x_buckets, train_y_buckets, predict_x_buckets = batch_processing(x_values,
                                                                           y_values,
                                                                           dates,
                                                                           date_label,
                                                                           batch,
                                                                           risk_exp,
                                                                           pre_process,
                                                                           post_process)

    ret = dict()
    ret['settlement'] = return_df
    ret['train'] = {'x': train_x_buckets, 'y': train_y_buckets}
    ret['predict'] = {'x': predict_x_buckets}
    return ret

if __name__ == '__main__':
    from PyFin.api import MA

    engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
    universe = Universe('zz500', ['zz500'])
    res = fetch_data_package(engine,
                             MA(10, 'EPS'),
                             '2012-01-01',
                             '2012-04-01',
                             '1m',
                             universe,
                             905,
                             0)
    print(res)
# -*- coding: utf-8 -*-
"""
Created on 2017-8-24

@author: cheng.li
"""

import pandas as pd
from typing import Iterable
from typing import Union
from PyFin.api import makeSchedule
from PyFin.api import BizDayConventions
from alphamind.data.transformer import Transformer
from alphamind.data.engines.sqlengine import SqlEngine
from alphamind.data.engines.universe import Universe

def _map_horizon(frequency: str) -> int:
    if frequency == '1d':
        return 0
    elif frequency == '1w':
        return 4
    elif frequency == '1m':
        return 21
    elif frequency == '3m':
        return 62
    else:
        raise ValueError('{0} is an unrecognized frequency rule'.format(frequency))

def prepare_data(engine: SqlEngine,
                 factors: Iterable[object],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 default_window: int=0):
    dates = makeSchedule(start_date, end_date, frequency, calendar='china.sse', dateRule=BizDayConventions.Following)
    horizon = _map_horizon(frequency)

    transformer = Transformer(factors)
    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates,
                                          default_window=default_window).sort_values(['Date', 'Code'])
    return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)

    df = pd.merge(factor_df, return_df, on=['Date', 'Code']).dropna()
    df = pd.merge(df, benchmark_df, on=['Date', 'Code'], how='left')
    df['weight'] = df['weight'].fillna(0.)

    return df[['Date', 'Code', 'dx']], df[['Date', 'Code', 'weight'] + transformer.names]

if __name__ == '__main__':
    from PyFin.api import *

    engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
    universe = Universe('zz500', ['zz500'])
    df1, df2 = prepare_data(engine,
                            MA(10, 'EPS'),
                            '2012-01-01',
                            '2013-01-01',
                            '1w',
                            universe,
                            905)
    print(df1)
    print(df2)