Commit 2553b951 authored by Dr.李

added fit target setting in models

parent 189ee654
...@@ -9,6 +9,7 @@ import copy ...@@ -9,6 +9,7 @@ import copy
import bisect import bisect
from typing import Iterable from typing import Iterable
import pandas as pd import pandas as pd
from typing import Union
from simpleutils.miscellaneous import list_eq from simpleutils.miscellaneous import list_eq
from alphamind.model.modelbase import ModelBase from alphamind.model.modelbase import ModelBase
from alphamind.model.data_preparing import fetch_train_phase from alphamind.model.data_preparing import fetch_train_phase
...@@ -19,6 +20,7 @@ from alphamind.data.winsorize import winsorize_normal ...@@ -19,6 +20,7 @@ from alphamind.data.winsorize import winsorize_normal
from alphamind.data.rank import rank from alphamind.data.rank import rank
from alphamind.data.standardize import standardize from alphamind.data.standardize import standardize
from alphamind.model.loader import load_model from alphamind.model.loader import load_model
from alphamind.data.transformer import Transformer
PROCESS_MAPPING = { PROCESS_MAPPING = {
'winsorize_normal': winsorize_normal, 'winsorize_normal': winsorize_normal,
...@@ -116,7 +118,8 @@ class DataMeta(object): ...@@ -116,7 +118,8 @@ class DataMeta(object):
self.risk_model, self.risk_model,
self.pre_process, self.pre_process,
self.post_process, self.post_process,
self.warm_start) self.warm_start,
fit_target=alpha_model.fit_target)
def fetch_predict_data(self, def fetch_predict_data(self,
ref_date: str, ref_date: str,
......
...@@ -60,7 +60,8 @@ def prepare_data(engine: SqlEngine, ...@@ -60,7 +60,8 @@ def prepare_data(engine: SqlEngine,
frequency: str, frequency: str,
universe: Universe, universe: Universe,
benchmark: int, benchmark: int,
warm_start: int = 0): warm_start: int = 0,
fit_target: Union[Transformer, object]=None):
if warm_start > 0: if warm_start > 0:
p = Period(frequency) p = Period(frequency)
p = Period(length=-warm_start * p.length(), units=p.units()) p = Period(length=-warm_start * p.length(), units=p.units())
...@@ -86,14 +87,22 @@ def prepare_data(engine: SqlEngine, ...@@ -86,14 +87,22 @@ def prepare_data(engine: SqlEngine,
factors=transformer, factors=transformer,
dates=dates).sort_values(['trade_date', 'code']) dates=dates).sort_values(['trade_date', 'code'])
alpha_logger.info("factor data loading finished") alpha_logger.info("factor data loading finished")
return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
alpha_logger.info("return data loading finished") if fit_target is None:
target_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
else:
one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
target_df = engine.fetch_factor_range_forward(universe, factors=fit_target, dates=dates + [one_more_date])
target_df = target_df[target_df.trade_date.isin(dates)]
target_df = target_df.groupby('code').apply(lambda x: x.fillna(method='pad'))
alpha_logger.info("fit target data loading finished")
industry_df = engine.fetch_industry_range(universe, dates=dates) industry_df = engine.fetch_industry_range(universe, dates=dates)
alpha_logger.info("industry data loading finished") alpha_logger.info("industry data loading finished")
benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates) benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
alpha_logger.info("benchmark data loading finished") alpha_logger.info("benchmark data loading finished")
df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna() df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()
df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left') df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
df = pd.merge(df, industry_df, on=['trade_date', 'code']) df = pd.merge(df, industry_df, on=['trade_date', 'code'])
df['weight'] = df['weight'].fillna(0.) df['weight'] = df['weight'].fillna(0.)
...@@ -262,7 +271,7 @@ def fetch_train_phase(engine, ...@@ -262,7 +271,7 @@ def fetch_train_phase(engine,
pre_process: Iterable[object] = None, pre_process: Iterable[object] = None,
post_process: Iterable[object] = None, post_process: Iterable[object] = None,
warm_start: int = 0, warm_start: int = 0,
fitting_target: Union[Transformer, object] = None) -> dict: fit_target: Union[Transformer, object] = None) -> dict:
if isinstance(alpha_factors, Transformer): if isinstance(alpha_factors, Transformer):
transformer = alpha_factors transformer = alpha_factors
else: else:
...@@ -282,11 +291,11 @@ def fetch_train_phase(engine, ...@@ -282,11 +291,11 @@ def fetch_train_phase(engine,
horizon = map_freq(frequency) horizon = map_freq(frequency)
factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates) factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates)
if fitting_target is None: if fit_target is None:
target_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon) target_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
else: else:
one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency) one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
target_df = engine.fetch_factor_range_forward(universe, factors=fitting_target, dates=dates + [one_more_date]) target_df = engine.fetch_factor_range_forward(universe, factors=fit_target, dates=dates + [one_more_date])
target_df = target_df[target_df.trade_date.isin(dates)] target_df = target_df[target_df.trade_date.isin(dates)]
target_df = target_df.groupby('code').apply(lambda x: x.fillna(method='pad')) target_df = target_df.groupby('code').apply(lambda x: x.fillna(method='pad'))
...@@ -424,14 +433,16 @@ def fetch_predict_phase(engine, ...@@ -424,14 +433,16 @@ def fetch_predict_phase(engine,
if __name__ == '__main__': if __name__ == '__main__':
engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha') from alphamind.api import risk_styles, industry_styles, standardize
engine = SqlEngine('postgresql+psycopg2://postgres:we083826@localhost/alpha')
universe = Universe('zz500', ['hs300', 'zz500']) universe = Universe('zz500', ['hs300', 'zz500'])
neutralized_risk = ['SIZE'] neutralized_risk = risk_styles + industry_styles
res = fetch_train_phase(engine, ['ep_q'], res = fetch_train_phase(engine, ['ep_q'],
'2012-01-05', '2012-01-05',
'5b', '5b',
universe, universe,
16, 16,
neutralized_risk=neutralized_risk, neutralized_risk=neutralized_risk,
fitting_target='closePrice') post_process=[standardize],
fit_target='closePrice')
print(res) print(res)
...@@ -32,8 +32,9 @@ class ConstLinearModel(ModelBase): ...@@ -32,8 +32,9 @@ class ConstLinearModel(ModelBase):
def __init__(self, def __init__(self,
features=None, features=None,
weights: dict = None): weights: dict = None,
super().__init__(features) fit_target=None):
super().__init__(features=features, fit_target=fit_target)
if features is not None and weights is not None: if features is not None and weights is not None:
pyFinAssert(len(features) == len(weights), pyFinAssert(len(features) == len(weights),
ValueError, ValueError,
...@@ -57,8 +58,8 @@ class ConstLinearModel(ModelBase): ...@@ -57,8 +58,8 @@ class ConstLinearModel(ModelBase):
class LinearRegression(ModelBase): class LinearRegression(ModelBase):
def __init__(self, features=None, fit_intercept: bool = False, **kwargs): def __init__(self, features=None, fit_intercept: bool = False, fit_target=None, **kwargs):
super().__init__(features) super().__init__(features=features, fit_target=fit_target)
self.impl = LinearRegressionImpl(fit_intercept=fit_intercept, **kwargs) self.impl = LinearRegressionImpl(fit_intercept=fit_intercept, **kwargs)
def save(self) -> dict: def save(self) -> dict:
...@@ -84,8 +85,8 @@ class LinearRegression(ModelBase): ...@@ -84,8 +85,8 @@ class LinearRegression(ModelBase):
class LassoRegression(ModelBase): class LassoRegression(ModelBase):
def __init__(self, alpha=0.01, features=None, fit_intercept: bool = False, **kwargs): def __init__(self, alpha=0.01, features=None, fit_intercept: bool = False, fit_target=None, **kwargs):
super().__init__(features) super().__init__(features=features, fit_target=fit_target)
self.impl = Lasso(alpha=alpha, fit_intercept=fit_intercept, **kwargs) self.impl = Lasso(alpha=alpha, fit_intercept=fit_intercept, **kwargs)
def save(self) -> dict: def save(self) -> dict:
...@@ -111,8 +112,8 @@ class LassoRegression(ModelBase): ...@@ -111,8 +112,8 @@ class LassoRegression(ModelBase):
class LogisticRegression(ModelBase): class LogisticRegression(ModelBase):
def __init__(self, features=None, fit_intercept: bool = False, **kwargs): def __init__(self, features=None, fit_intercept: bool = False, fit_target=None, **kwargs):
super().__init__(features) super().__init__(features=features, fit_target=fit_target)
self.impl = LogisticRegressionImpl(fit_intercept=fit_intercept, **kwargs) self.impl = LogisticRegressionImpl(fit_intercept=fit_intercept, **kwargs)
def save(self) -> dict: def save(self) -> dict:
......
...@@ -18,12 +18,17 @@ from alphamind.data.transformer import Transformer ...@@ -18,12 +18,17 @@ from alphamind.data.transformer import Transformer
class ModelBase(metaclass=abc.ABCMeta): class ModelBase(metaclass=abc.ABCMeta):
def __init__(self, features=None): def __init__(self, features=None, fit_target=None):
if features is not None: if features is not None:
self.formulas = Transformer(features) self.formulas = Transformer(features)
self.features = self.formulas.names self.features = self.formulas.names
else: else:
self.features = None self.features = None
if fit_target is not None:
self.fit_target = Transformer(fit_target)
else:
self.fit_target = None
self.impl = None self.impl = None
self.trained_time = None self.trained_time = None
...@@ -31,7 +36,8 @@ class ModelBase(metaclass=abc.ABCMeta): ...@@ -31,7 +36,8 @@ class ModelBase(metaclass=abc.ABCMeta):
return encode(self.impl) == encode(rhs.impl) \ return encode(self.impl) == encode(rhs.impl) \
and self.trained_time == rhs.trained_time \ and self.trained_time == rhs.trained_time \
and list_eq(self.features, rhs.features) \ and list_eq(self.features, rhs.features) \
and encode(self.formulas) == encode(rhs.formulas) and encode(self.formulas) == encode(rhs.formulas) \
and encode(self.fit_target) == encode(rhs.fit_target)
def fit(self, x: pd.DataFrame, y: np.ndarray): def fit(self, x: pd.DataFrame, y: np.ndarray):
self.impl.fit(x[self.features].values, y.flatten()) self.impl.fit(x[self.features].values, y.flatten())
...@@ -56,15 +62,21 @@ class ModelBase(metaclass=abc.ABCMeta): ...@@ -56,15 +62,21 @@ class ModelBase(metaclass=abc.ABCMeta):
trained_time=self.trained_time, trained_time=self.trained_time,
desc=encode(self.impl), desc=encode(self.impl),
formulas=encode(self.formulas), formulas=encode(self.formulas),
fit_target=encode(self.fit_target),
internal_model=self.impl.__class__.__module__ + "." + self.impl.__class__.__name__) internal_model=self.impl.__class__.__module__ + "." + self.impl.__class__.__name__)
return model_desc return model_desc
@abc.abstractclassmethod @abc.abstractmethod
@classmethod
def load(cls, model_desc: dict): def load(cls, model_desc: dict):
obj_layout = cls() obj_layout = cls()
obj_layout.features = model_desc['features'] obj_layout.features = model_desc['features']
obj_layout.formulas = decode(model_desc['formulas']) obj_layout.formulas = decode(model_desc['formulas'])
obj_layout.trained_time = model_desc['trained_time'] obj_layout.trained_time = model_desc['trained_time']
obj_layout.impl = decode(model_desc['desc']) obj_layout.impl = decode(model_desc['desc'])
if 'fit_target' in model_desc:
obj_layout.fit_target = decode(model_desc['fit_target'])
else:
obj_layout.fit_target = None
return obj_layout return obj_layout
...@@ -27,8 +27,9 @@ class RandomForestRegressor(ModelBase): ...@@ -27,8 +27,9 @@ class RandomForestRegressor(ModelBase):
n_estimators: int=100, n_estimators: int=100,
max_features: str='auto', max_features: str='auto',
features=None, features=None,
fit_target=None,
**kwargs): **kwargs):
super().__init__(features) super().__init__(features=features, fit_target=fit_target)
self.impl = RandomForestRegressorImpl(n_estimators=n_estimators, self.impl = RandomForestRegressorImpl(n_estimators=n_estimators,
max_features=max_features, max_features=max_features,
**kwargs) **kwargs)
...@@ -59,8 +60,9 @@ class RandomForestClassifier(ModelBase): ...@@ -59,8 +60,9 @@ class RandomForestClassifier(ModelBase):
n_estimators: int=100, n_estimators: int=100,
max_features: str='auto', max_features: str='auto',
features=None, features=None,
fit_target=None,
**kwargs): **kwargs):
super().__init__(features) super().__init__(features=features, fit_target=fit_target)
self.impl = RandomForestClassifierImpl(n_estimators=n_estimators, self.impl = RandomForestClassifierImpl(n_estimators=n_estimators,
max_features=max_features, max_features=max_features,
**kwargs) **kwargs)
...@@ -92,9 +94,10 @@ class XGBRegressor(ModelBase): ...@@ -92,9 +94,10 @@ class XGBRegressor(ModelBase):
learning_rate: float=0.1, learning_rate: float=0.1,
max_depth: int=3, max_depth: int=3,
features=None, features=None,
fit_target=None,
n_jobs: int=1, n_jobs: int=1,
**kwargs): **kwargs):
super().__init__(features) super().__init__(features=features, fit_target=fit_target)
self.impl = XGBRegressorImpl(n_estimators=n_estimators, self.impl = XGBRegressorImpl(n_estimators=n_estimators,
learning_rate=learning_rate, learning_rate=learning_rate,
max_depth=max_depth, max_depth=max_depth,
...@@ -128,9 +131,10 @@ class XGBClassifier(ModelBase): ...@@ -128,9 +131,10 @@ class XGBClassifier(ModelBase):
learning_rate: float=0.1, learning_rate: float=0.1,
max_depth: int=3, max_depth: int=3,
features=None, features=None,
fit_target=None,
n_jobs: int=1, n_jobs: int=1,
**kwargs): **kwargs):
super().__init__(features) super().__init__(features=features, fit_target=fit_target)
self.impl = XGBClassifierImpl(n_estimators=n_estimators, self.impl = XGBClassifierImpl(n_estimators=n_estimators,
learning_rate=learning_rate, learning_rate=learning_rate,
max_depth=max_depth, max_depth=max_depth,
...@@ -171,10 +175,11 @@ class XGBTrainer(ModelBase): ...@@ -171,10 +175,11 @@ class XGBTrainer(ModelBase):
subsample=1., subsample=1.,
colsample_bytree=1., colsample_bytree=1.,
features=None, features=None,
fit_target=None,
random_state: int=0, random_state: int=0,
n_jobs: int=1, n_jobs: int=1,
**kwargs): **kwargs):
super().__init__(features) super().__init__(features=features, fit_target=fit_target)
self.params = { self.params = {
'silent': 1, 'silent': 1,
'objective': objective, 'objective': objective,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment