Unverified commit a1cc7865, authored by iLampard and committed by GitHub

Merge pull request #8 from alpha-miner/master

merge update
parents 45afaf12 81538ae6
...@@ -481,7 +481,7 @@ class SqlEngine(object): ...@@ -481,7 +481,7 @@ class SqlEngine(object):
) )
df = pd.read_sql(query, self.engine).sort_values(['trade_date', 'code']) df = pd.read_sql(query, self.engine).sort_values(['trade_date', 'code'])
return df return pd.merge(df, codes[['trade_date', 'code']], how='inner')
def fetch_benchmark(self, def fetch_benchmark(self,
ref_date: str, ref_date: str,
......
...@@ -47,7 +47,7 @@ def _map_factors(factors: Iterable[str], used_factor_tables) -> Dict: ...@@ -47,7 +47,7 @@ def _map_factors(factors: Iterable[str], used_factor_tables) -> Dict:
break break
if not factor_cols: if not factor_cols:
raise ValueError(f"some factors in <{factors}> can't be find") raise ValueError("some factors in <{0}> can't be find".format(factors))
return factor_cols return factor_cols
......
...@@ -8,6 +8,7 @@ Created on 2017-9-27 ...@@ -8,6 +8,7 @@ Created on 2017-9-27
import copy import copy
import bisect import bisect
from typing import Iterable from typing import Iterable
import numpy as np
import pandas as pd import pandas as pd
from simpleutils.miscellaneous import list_eq from simpleutils.miscellaneous import list_eq
from alphamind.model.modelbase import ModelBase from alphamind.model.modelbase import ModelBase
...@@ -133,7 +134,8 @@ class DataMeta(object): ...@@ -133,7 +134,8 @@ class DataMeta(object):
self.pre_process, self.pre_process,
self.post_process, self.post_process,
self.warm_start, self.warm_start,
fillna=True) fillna=True,
fit_target=alpha_model.fit_target)
def train_model(ref_date: str, def train_model(ref_date: str,
...@@ -186,6 +188,15 @@ class Composer(object): ...@@ -186,6 +188,15 @@ class Composer(object):
codes = x.index codes = x.index
return pd.DataFrame(model.predict(x_values).flatten(), index=codes) return pd.DataFrame(model.predict(x_values).flatten(), index=codes)
def score(self, ref_date: str, x: pd.DataFrame = None, y: np.ndarray = None) -> float:
    """Score the latest model available for ``ref_date``.

    Parameters
    ----------
    ref_date : str
        Date handed to ``_fetch_latest_model`` and, when data must be
        loaded, to ``data_meta.fetch_predict_data``.
    x : pd.DataFrame, optional
        Features to score on. When ``None``, the ``'x'`` entry of the
        fetched predict data is used.
    y : np.ndarray, optional
        Targets to score against. When ``None``, the ``'y'`` entry of the
        fetched predict data is used.

    Returns
    -------
    float
        Whatever ``model.score(x, y)`` returns for the selected model.
    """
    model = self._fetch_latest_model(ref_date)

    if x is None or y is None:
        # Load the predict data once if *either* input is missing, so that
        # supplying ``x`` without ``y`` still yields a usable target.
        # NOTE(review): when only one of x/y is supplied, the caller is
        # presumably responsible for it being row-aligned with the fetched
        # counterpart -- confirm against callers.
        predict = self.data_meta.fetch_predict_data(ref_date, model)['predict']
        if x is None:
            x = predict['x']
        if y is None:
            y = predict['y']

    return model.score(x, y)
def _fetch_latest_model(self, ref_date) -> ModelBase: def _fetch_latest_model(self, ref_date) -> ModelBase:
if self.is_updated: if self.is_updated:
sorted_keys = self.sorted_keys sorted_keys = self.sorted_keys
...@@ -211,35 +222,33 @@ class Composer(object): ...@@ -211,35 +222,33 @@ class Composer(object):
if __name__ == '__main__': if __name__ == '__main__':
import numpy as np from PyFin.api import LAST
from alphamind.data.standardize import standardize from alphamind.data.engines.sqlengine import risk_styles, industry_styles
from alphamind.data.winsorize import winsorize_normal from alphamind.model.linearmodel import LinearRegression
from alphamind.data.engines.sqlengine import industry_styles
from alphamind.model.linearmodel import ConstLinearModel universe = Universe('custom', ['ashare_ex'])
freq = '20b'
data_source = "postgres+psycopg2://postgres:we083826@localhost/alpha" batch = 0
alpha_model = ConstLinearModel(['EPS'], np.array([1.])) neutralized_risk = risk_styles + industry_styles
alpha_factors = ['EPS']
freq = '1w'
universe = Universe('zz500', ['zz500'])
batch = 4
neutralized_risk = ['SIZE'] + industry_styles
risk_model = 'short' risk_model = 'short'
pre_process = [winsorize_normal, standardize] pre_process = [winsorize_normal, standardize]
pos_process = [winsorize_normal, standardize] post_process = [standardize]
warm_start = 0
data_meta = DataMeta(freq, data_source = "postgres+psycopg2://postgres:we083826@localhost/alpha"
universe,
batch, data_meta = DataMeta(freq=freq,
neutralized_risk, universe=universe,
risk_model, batch=batch,
pre_process, neutralized_risk=neutralized_risk,
pos_process, risk_model=risk_model,
pre_process=pre_process,
post_process=post_process,
warm_start=warm_start,
data_source=data_source) data_source=data_source)
composer = Composer(alpha_model, data_meta) alpha_model = LinearRegression({'roe_q': LAST('roe_q')}, fit_target='roe_q')
composer = Composer(alpha_model=alpha_model, data_meta=data_meta)
composer.train('2017-09-20') ref_date = '2018-01-30'
composer.train('2017-09-22') composer.train(ref_date)
composer.train('2017-09-25') res = composer.predict(ref_date)
composer.predict('2017-09-21') \ No newline at end of file
...@@ -106,6 +106,7 @@ def prepare_data(engine: SqlEngine, ...@@ -106,6 +106,7 @@ def prepare_data(engine: SqlEngine,
df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left') df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
df = pd.merge(df, industry_df, on=['trade_date', 'code']) df = pd.merge(df, industry_df, on=['trade_date', 'code'])
df['weight'] = df['weight'].fillna(0.) df['weight'] = df['weight'].fillna(0.)
df.dropna(inplace=True)
return dates, df[['trade_date', 'code', 'dx']], df[ return dates, df[['trade_date', 'code', 'dx']], df[
['trade_date', 'code', 'weight', 'isOpen', 'industry_code', 'industry'] + transformer.names] ['trade_date', 'code', 'weight', 'isOpen', 'industry_code', 'industry'] + transformer.names]
...@@ -310,10 +311,10 @@ def fetch_train_phase(engine, ...@@ -310,10 +311,10 @@ def fetch_train_phase(engine,
if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'): if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
pyFinAssert(len(dates) >= 2, ValueError, "No previous data for training for the date {0}".format(ref_date)) pyFinAssert(len(dates) >= 2, ValueError, "No previous data for training for the date {0}".format(ref_date))
end = dates[-2] end = dates[-2]
start = dates[-batch - 1] if batch <= len(dates) - 1 else dates[0] start = dates[-batch - 2] if batch <= len(dates) - 2 else dates[0]
else: else:
end = dates[-1] end = dates[-1]
start = dates[-batch] if batch <= len(dates) else dates[0] start = dates[-batch - 1] if batch <= len(dates) else dates[0]
index = (date_label >= start) & (date_label <= end) index = (date_label >= start) & (date_label <= end)
this_raw_x = x_values[index] this_raw_x = x_values[index]
...@@ -352,7 +353,8 @@ def fetch_predict_phase(engine, ...@@ -352,7 +353,8 @@ def fetch_predict_phase(engine,
pre_process: Iterable[object] = None, pre_process: Iterable[object] = None,
post_process: Iterable[object] = None, post_process: Iterable[object] = None,
warm_start: int = 0, warm_start: int = 0,
fillna: str = None): fillna: str = None,
fit_target: Union[Transformer, object] = None):
if isinstance(alpha_factors, Transformer): if isinstance(alpha_factors, Transformer):
transformer = alpha_factors transformer = alpha_factors
else: else:
...@@ -369,6 +371,8 @@ def fetch_predict_phase(engine, ...@@ -369,6 +371,8 @@ def fetch_predict_phase(engine,
dateRule=BizDayConventions.Following, dateRule=BizDayConventions.Following,
dateGenerationRule=DateGeneration.Backward) dateGenerationRule=DateGeneration.Backward)
horizon = map_freq(frequency)
factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates) factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates)
if fillna: if fillna:
...@@ -377,6 +381,14 @@ def fetch_predict_phase(engine, ...@@ -377,6 +381,14 @@ def fetch_predict_phase(engine,
else: else:
factor_df = factor_df.dropna() factor_df = factor_df.dropna()
if fit_target is None:
target_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
else:
one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
target_df = engine.fetch_factor_range_forward(universe, factors=fit_target, dates=dates + [one_more_date])
target_df = target_df[target_df.trade_date.isin(dates)]
target_df = target_df.groupby('code').apply(lambda x: x.fillna(method='pad'))
names = transformer.names names = transformer.names
if neutralized_risk: if neutralized_risk:
...@@ -384,13 +396,17 @@ def fetch_predict_phase(engine, ...@@ -384,13 +396,17 @@ def fetch_predict_phase(engine,
used_neutralized_risk = list(set(neutralized_risk).difference(names)) used_neutralized_risk = list(set(neutralized_risk).difference(names))
risk_df = risk_df[['trade_date', 'code'] + used_neutralized_risk].dropna() risk_df = risk_df[['trade_date', 'code'] + used_neutralized_risk].dropna()
train_x = pd.merge(factor_df, risk_df, on=['trade_date', 'code']) train_x = pd.merge(factor_df, risk_df, on=['trade_date', 'code'])
train_x = pd.merge(train_x, target_df, on=['trade_date', 'code'], how='left')
risk_exp = train_x[neutralized_risk].values.astype(float) risk_exp = train_x[neutralized_risk].values.astype(float)
else: else:
train_x = factor_df.copy() train_x = pd.merge(factor_df, target_df, on=['trade_date', 'code'], how='left')
risk_exp = None risk_exp = None
train_x.dropna(inplace=True)
x_values = train_x[names].values.astype(float) x_values = train_x[names].values.astype(float)
y_values = train_x[['dx']].values.astype(float)
date_label = pd.DatetimeIndex(factor_df.trade_date).to_pydatetime() date_label = pd.DatetimeIndex(train_x.trade_date).to_pydatetime()
dates = np.unique(date_label) dates = np.unique(date_label)
if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'): if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
...@@ -400,6 +416,7 @@ def fetch_predict_phase(engine, ...@@ -400,6 +416,7 @@ def fetch_predict_phase(engine,
left_index = bisect.bisect_left(date_label, start) left_index = bisect.bisect_left(date_label, start)
right_index = bisect.bisect_right(date_label, end) right_index = bisect.bisect_right(date_label, end)
this_raw_x = x_values[left_index:right_index] this_raw_x = x_values[left_index:right_index]
this_raw_y = y_values[left_index:right_index]
sub_dates = date_label[left_index:right_index] sub_dates = date_label[left_index:right_index]
if risk_exp is not None: if risk_exp is not None:
...@@ -412,10 +429,16 @@ def fetch_predict_phase(engine, ...@@ -412,10 +429,16 @@ def fetch_predict_phase(engine,
risk_factors=this_risk_exp, risk_factors=this_risk_exp,
post_process=post_process) post_process=post_process)
ne_y = factor_processing(this_raw_y,
pre_process=pre_process,
risk_factors=this_risk_exp,
post_process=post_process)
inner_left_index = bisect.bisect_left(sub_dates, end) inner_left_index = bisect.bisect_left(sub_dates, end)
inner_right_index = bisect.bisect_right(sub_dates, end) inner_right_index = bisect.bisect_right(sub_dates, end)
ne_x = ne_x[inner_left_index:inner_right_index] ne_x = ne_x[inner_left_index:inner_right_index]
ne_y = ne_y[inner_left_index:inner_right_index]
left_index = bisect.bisect_left(date_label, end) left_index = bisect.bisect_left(date_label, end)
right_index = bisect.bisect_right(date_label, end) right_index = bisect.bisect_right(date_label, end)
...@@ -423,11 +446,12 @@ def fetch_predict_phase(engine, ...@@ -423,11 +446,12 @@ def fetch_predict_phase(engine,
codes = train_x.code.values[left_index:right_index] codes = train_x.code.values[left_index:right_index]
else: else:
ne_x = None ne_x = None
ne_y = None
codes = None codes = None
ret = dict() ret = dict()
ret['x_names'] = transformer.names ret['x_names'] = transformer.names
ret['predict'] = {'x': pd.DataFrame(ne_x, columns=transformer.names), 'code': codes} ret['predict'] = {'x': pd.DataFrame(ne_x, columns=transformer.names), 'code': codes, 'y': ne_y.flatten()}
return ret return ret
...@@ -437,7 +461,7 @@ if __name__ == '__main__': ...@@ -437,7 +461,7 @@ if __name__ == '__main__':
engine = SqlEngine('postgresql+psycopg2://postgres:we083826@localhost/alpha') engine = SqlEngine('postgresql+psycopg2://postgres:we083826@localhost/alpha')
universe = Universe('zz500', ['hs300', 'zz500']) universe = Universe('zz500', ['hs300', 'zz500'])
neutralized_risk = risk_styles + industry_styles neutralized_risk = risk_styles + industry_styles
res = fetch_train_phase(engine, ['ep_q'], res = fetch_predict_phase(engine, ['ep_q'],
'2012-01-05', '2012-01-05',
'5b', '5b',
universe, universe,
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment