Unverified Commit 6f2b65e5 authored by iLampard's avatar iLampard Committed by GitHub

Merge pull request #7 from alpha-miner/master

merge update
parents 3dd7d3f7 31805b28
...@@ -417,8 +417,7 @@ class SqlEngine(object): ...@@ -417,8 +417,7 @@ class SqlEngine(object):
df = pd.read_sql(query, self.engine) df = pd.read_sql(query, self.engine)
if universe.is_filtered: if universe.is_filtered:
codes = universe.query(self, start_date, end_date, dates) df = pd.merge(df, universe_df, how='inner', on=['trade_date', 'code'])
df = pd.merge(df, codes, how='inner', on=['trade_date', 'code'])
if external_data is not None: if external_data is not None:
df = pd.merge(df, external_data, on=['trade_date', 'code']).dropna() df = pd.merge(df, external_data, on=['trade_date', 'code']).dropna()
...@@ -435,6 +434,55 @@ class SqlEngine(object): ...@@ -435,6 +434,55 @@ class SqlEngine(object):
df = df.reset_index() df = df.reset_index()
return pd.merge(df, universe_df[['trade_date', 'code']], how='inner') return pd.merge(df, universe_df[['trade_date', 'code']], how='inner')
def fetch_factor_range_forward(self,
                               universe: Universe,
                               factors: Union[Transformer, object],
                               start_date: str = None,
                               end_date: str = None,
                               dates: Iterable[str] = None):
    """Fetch the next-period (forward) value of a factor for a universe.

    For every (trade_date, code) pair the value of the factor on the
    *following* trade date of the same code is returned in a column
    named ``dx`` (NULL on each code's last observed date).

    Parameters
    ----------
    universe : Universe
        Universe used to restrict the codes and dates of the result.
    factors : Transformer or object
        A Transformer instance, or anything accepted by the Transformer
        constructor (e.g. a factor name or a list of names).
        NOTE: only the FIRST dependency of the transformer is shifted
        forward and selected; any extra dependencies are joined but
        silently ignored — pass a single factor to avoid surprises.
    start_date, end_date : str, optional
        Inclusive date range, used only when ``dates`` is not given.
    dates : Iterable[str], optional
        Explicit trade dates; takes precedence over the range.

    Returns
    -------
    pd.DataFrame
        Columns ``trade_date``, ``code``, ``dx``, sorted by
        ``['trade_date', 'code']``.
    """
    transformer = factors if isinstance(factors, Transformer) else Transformer(factors)
    dependency = transformer.dependency
    factor_cols = _map_factors(dependency, factor_tables)

    codes = universe.query(self, start_date, end_date, dates)
    total_codes = codes.code.unique().tolist()
    total_dates = codes.trade_date.astype(str).unique().tolist()

    # Left-join every table hosting a requested factor onto Market,
    # embedding the date window in the join condition so the database
    # can prune rows early.
    big_table = Market
    joined_tables = {Market.__table__.name}
    for t in set(factor_cols.values()):
        if t.__table__.name in joined_tables:
            continue
        if dates is not None:
            date_cond = Market.trade_date.in_(dates)
        else:
            date_cond = Market.trade_date.between(start_date, end_date)
        big_table = outerjoin(big_table, t, and_(Market.trade_date == t.trade_date,
                                                 Market.code == t.code,
                                                 date_cond))
        joined_tables.add(t.__table__.name)

    # lead(col, 1) is the standard-SQL window function for looking one
    # row ahead within each code's time series; the previous
    # lag(col, -1) form relied on PostgreSQL accepting a negative
    # offset and is not portable.
    first_col = next(iter(factor_cols))
    stats = func.lead(first_col, 1).over(
        partition_by=Market.code,
        order_by=Market.trade_date).label('dx')

    query = select([Market.trade_date, Market.code, stats]).select_from(big_table).where(
        and_(
            Market.trade_date.in_(total_dates),
            Market.code.in_(total_codes)
        )
    )
    df = pd.read_sql(query, self.engine).sort_values(['trade_date', 'code'])
    return df
def fetch_benchmark(self, def fetch_benchmark(self,
ref_date: str, ref_date: str,
benchmark: int, benchmark: int,
...@@ -988,9 +1036,6 @@ if __name__ == '__main__': ...@@ -988,9 +1036,6 @@ if __name__ == '__main__':
universe = Universe('', ['zz800']) universe = Universe('', ['zz800'])
codes = engine.fetch_codes(ref_date, universe) codes = engine.fetch_codes(ref_date, universe)
dates = makeSchedule('2010-01-01', '2018-02-01', '10b', 'china.sse') dates = makeSchedule('2018-01-01', '2018-02-01', '10b', 'china.sse')
# df = engine.fetch_factor_range(universe, DIFF('roe_q'), dates=dates) factor_data = engine.fetch_factor_range_forward(universe, ['roe_q'], dates=dates)
print(factor_data)
risk_cov, risk_exposure = engine.fetch_risk_model(ref_date, codes)
factor_data = engine.fetch_factor_range(universe, ['roe_q'], dates=dates)
risk_cov, risk_exposure = engine.fetch_risk_model_range(universe, dates=dates)
...@@ -7,6 +7,7 @@ Created on 2017-12-25 ...@@ -7,6 +7,7 @@ Created on 2017-12-25
from typing import Iterable from typing import Iterable
from typing import Dict from typing import Dict
from alphamind.data.dbmodel.models import Market
from alphamind.data.dbmodel.models import RiskCovDay from alphamind.data.dbmodel.models import RiskCovDay
from alphamind.data.dbmodel.models import RiskCovShort from alphamind.data.dbmodel.models import RiskCovShort
from alphamind.data.dbmodel.models import RiskCovLong from alphamind.data.dbmodel.models import RiskCovLong
...@@ -22,7 +23,7 @@ from alphamind.data.dbmodel.models import RiskExposure ...@@ -22,7 +23,7 @@ from alphamind.data.dbmodel.models import RiskExposure
from alphamind.data.engines.industries import INDUSTRY_MAPPING from alphamind.data.engines.industries import INDUSTRY_MAPPING
factor_tables = [RiskExposure, Uqer, Gogoal, Experimental, LegacyFactor, Tiny] factor_tables = [Market, RiskExposure, Uqer, Gogoal, Experimental, LegacyFactor, Tiny]
def _map_risk_model_table(risk_model: str) -> tuple: def _map_risk_model_table(risk_model: str) -> tuple:
...@@ -44,13 +45,17 @@ def _map_factors(factors: Iterable[str], used_factor_tables) -> Dict: ...@@ -44,13 +45,17 @@ def _map_factors(factors: Iterable[str], used_factor_tables) -> Dict:
if f not in excluded and f in t.__table__.columns: if f not in excluded and f in t.__table__.columns:
factor_cols[t.__table__.columns[f]] = t factor_cols[t.__table__.columns[f]] = t
break break
if not factor_cols:
raise ValueError(f"some factors in <{factors}> can't be find")
return factor_cols return factor_cols
def _map_industry_category(category: str) -> str: def _map_industry_category(category: str) -> str:
if category == 'sw': if category == 'sw':
return '申万行业分类' return '申万行业分类'
if category == 'sw_adj': elif category == 'sw_adj':
return '申万行业分类修订' return '申万行业分类修订'
elif category == 'zz': elif category == 'zz':
return '中证行业分类' return '中证行业分类'
......
...@@ -16,8 +16,8 @@ class TargetVolExecutor(ExecutorBase): ...@@ -16,8 +16,8 @@ class TargetVolExecutor(ExecutorBase):
def __init__(self, window=30, target_vol=0.01): def __init__(self, window=30, target_vol=0.01):
super().__init__() super().__init__()
self.m_vol = MovingStandardDeviation(window=window, dependency='return') self.m_vol = MovingStandardDeviation(window, 'return')
self.m_leverage = MovingAverage(window=window, dependency='leverage') self.m_leverage = MovingAverage(window, 'leverage')
self.target_vol = target_vol self.target_vol = target_vol
self.multiplier = 1. self.multiplier = 1.
......
...@@ -116,7 +116,8 @@ class DataMeta(object): ...@@ -116,7 +116,8 @@ class DataMeta(object):
self.risk_model, self.risk_model,
self.pre_process, self.pre_process,
self.post_process, self.post_process,
self.warm_start) self.warm_start,
fit_target=alpha_model.fit_target)
def fetch_predict_data(self, def fetch_predict_data(self,
ref_date: str, ref_date: str,
......
...@@ -60,7 +60,8 @@ def prepare_data(engine: SqlEngine, ...@@ -60,7 +60,8 @@ def prepare_data(engine: SqlEngine,
frequency: str, frequency: str,
universe: Universe, universe: Universe,
benchmark: int, benchmark: int,
warm_start: int = 0): warm_start: int = 0,
fit_target: Union[Transformer, object]=None):
if warm_start > 0: if warm_start > 0:
p = Period(frequency) p = Period(frequency)
p = Period(length=-warm_start * p.length(), units=p.units()) p = Period(length=-warm_start * p.length(), units=p.units())
...@@ -86,14 +87,22 @@ def prepare_data(engine: SqlEngine, ...@@ -86,14 +87,22 @@ def prepare_data(engine: SqlEngine,
factors=transformer, factors=transformer,
dates=dates).sort_values(['trade_date', 'code']) dates=dates).sort_values(['trade_date', 'code'])
alpha_logger.info("factor data loading finished") alpha_logger.info("factor data loading finished")
return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
alpha_logger.info("return data loading finished") if fit_target is None:
target_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
else:
one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
target_df = engine.fetch_factor_range_forward(universe, factors=fit_target, dates=dates + [one_more_date])
target_df = target_df[target_df.trade_date.isin(dates)]
target_df = target_df.groupby('code').apply(lambda x: x.fillna(method='pad'))
alpha_logger.info("fit target data loading finished")
industry_df = engine.fetch_industry_range(universe, dates=dates) industry_df = engine.fetch_industry_range(universe, dates=dates)
alpha_logger.info("industry data loading finished") alpha_logger.info("industry data loading finished")
benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates) benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
alpha_logger.info("benchmark data loading finished") alpha_logger.info("benchmark data loading finished")
df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna() df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()
df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left') df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
df = pd.merge(df, industry_df, on=['trade_date', 'code']) df = pd.merge(df, industry_df, on=['trade_date', 'code'])
df['weight'] = df['weight'].fillna(0.) df['weight'] = df['weight'].fillna(0.)
...@@ -261,7 +270,8 @@ def fetch_train_phase(engine, ...@@ -261,7 +270,8 @@ def fetch_train_phase(engine,
risk_model: str = 'short', risk_model: str = 'short',
pre_process: Iterable[object] = None, pre_process: Iterable[object] = None,
post_process: Iterable[object] = None, post_process: Iterable[object] = None,
warm_start: int = 0) -> dict: warm_start: int = 0,
fit_target: Union[Transformer, object] = None) -> dict:
if isinstance(alpha_factors, Transformer): if isinstance(alpha_factors, Transformer):
transformer = alpha_factors transformer = alpha_factors
else: else:
...@@ -281,7 +291,13 @@ def fetch_train_phase(engine, ...@@ -281,7 +291,13 @@ def fetch_train_phase(engine,
horizon = map_freq(frequency) horizon = map_freq(frequency)
factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates) factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates)
target_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon) if fit_target is None:
target_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
else:
one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
target_df = engine.fetch_factor_range_forward(universe, factors=fit_target, dates=dates + [one_more_date])
target_df = target_df[target_df.trade_date.isin(dates)]
target_df = target_df.groupby('code').apply(lambda x: x.fillna(method='pad'))
df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna() df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()
...@@ -336,7 +352,7 @@ def fetch_predict_phase(engine, ...@@ -336,7 +352,7 @@ def fetch_predict_phase(engine,
pre_process: Iterable[object] = None, pre_process: Iterable[object] = None,
post_process: Iterable[object] = None, post_process: Iterable[object] = None,
warm_start: int = 0, warm_start: int = 0,
fillna: str=None): fillna: str = None):
if isinstance(alpha_factors, Transformer): if isinstance(alpha_factors, Transformer):
transformer = alpha_factors transformer = alpha_factors
else: else:
...@@ -356,7 +372,8 @@ def fetch_predict_phase(engine, ...@@ -356,7 +372,8 @@ def fetch_predict_phase(engine,
factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates) factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates)
if fillna: if fillna:
factor_df = factor_df.groupby('trade_date').apply(lambda x: x.fillna(x.median())).reset_index(drop=True).dropna() factor_df = factor_df.groupby('trade_date').apply(lambda x: x.fillna(x.median())).reset_index(
drop=True).dropna()
else: else:
factor_df = factor_df.dropna() factor_df = factor_df.dropna()
...@@ -416,13 +433,16 @@ def fetch_predict_phase(engine, ...@@ -416,13 +433,16 @@ def fetch_predict_phase(engine,
if __name__ == '__main__': if __name__ == '__main__':
engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha') from alphamind.api import risk_styles, industry_styles, standardize
engine = SqlEngine('postgresql+psycopg2://postgres:we083826@localhost/alpha')
universe = Universe('zz500', ['hs300', 'zz500']) universe = Universe('zz500', ['hs300', 'zz500'])
neutralized_risk = ['SIZE'] neutralized_risk = risk_styles + industry_styles
res = fetch_train_phase(engine, ['ep_q'], res = fetch_train_phase(engine, ['ep_q'],
'2012-01-05', '2012-01-05',
'5b', '5b',
universe, universe,
16, 16,
neutralized_risk=neutralized_risk) neutralized_risk=neutralized_risk,
post_process=[standardize],
fit_target='closePrice')
print(res) print(res)
...@@ -32,8 +32,9 @@ class ConstLinearModel(ModelBase): ...@@ -32,8 +32,9 @@ class ConstLinearModel(ModelBase):
def __init__(self, def __init__(self,
features=None, features=None,
weights: dict = None): weights: dict = None,
super().__init__(features) fit_target=None):
super().__init__(features=features, fit_target=fit_target)
if features is not None and weights is not None: if features is not None and weights is not None:
pyFinAssert(len(features) == len(weights), pyFinAssert(len(features) == len(weights),
ValueError, ValueError,
...@@ -57,8 +58,8 @@ class ConstLinearModel(ModelBase): ...@@ -57,8 +58,8 @@ class ConstLinearModel(ModelBase):
class LinearRegression(ModelBase): class LinearRegression(ModelBase):
def __init__(self, features=None, fit_intercept: bool = False, **kwargs): def __init__(self, features=None, fit_intercept: bool = False, fit_target=None, **kwargs):
super().__init__(features) super().__init__(features=features, fit_target=fit_target)
self.impl = LinearRegressionImpl(fit_intercept=fit_intercept, **kwargs) self.impl = LinearRegressionImpl(fit_intercept=fit_intercept, **kwargs)
def save(self) -> dict: def save(self) -> dict:
...@@ -84,8 +85,8 @@ class LinearRegression(ModelBase): ...@@ -84,8 +85,8 @@ class LinearRegression(ModelBase):
class LassoRegression(ModelBase): class LassoRegression(ModelBase):
def __init__(self, alpha=0.01, features=None, fit_intercept: bool = False, **kwargs): def __init__(self, alpha=0.01, features=None, fit_intercept: bool = False, fit_target=None, **kwargs):
super().__init__(features) super().__init__(features=features, fit_target=fit_target)
self.impl = Lasso(alpha=alpha, fit_intercept=fit_intercept, **kwargs) self.impl = Lasso(alpha=alpha, fit_intercept=fit_intercept, **kwargs)
def save(self) -> dict: def save(self) -> dict:
...@@ -111,8 +112,8 @@ class LassoRegression(ModelBase): ...@@ -111,8 +112,8 @@ class LassoRegression(ModelBase):
class LogisticRegression(ModelBase): class LogisticRegression(ModelBase):
def __init__(self, features=None, fit_intercept: bool = False, **kwargs): def __init__(self, features=None, fit_intercept: bool = False, fit_target=None, **kwargs):
super().__init__(features) super().__init__(features=features, fit_target=fit_target)
self.impl = LogisticRegressionImpl(fit_intercept=fit_intercept, **kwargs) self.impl = LogisticRegressionImpl(fit_intercept=fit_intercept, **kwargs)
def save(self) -> dict: def save(self) -> dict:
......
...@@ -18,12 +18,17 @@ from alphamind.data.transformer import Transformer ...@@ -18,12 +18,17 @@ from alphamind.data.transformer import Transformer
class ModelBase(metaclass=abc.ABCMeta): class ModelBase(metaclass=abc.ABCMeta):
def __init__(self, features=None): def __init__(self, features=None, fit_target=None):
if features is not None: if features is not None:
self.formulas = Transformer(features) self.formulas = Transformer(features)
self.features = self.formulas.names self.features = self.formulas.names
else: else:
self.features = None self.features = None
if fit_target is not None:
self.fit_target = Transformer(fit_target)
else:
self.fit_target = None
self.impl = None self.impl = None
self.trained_time = None self.trained_time = None
...@@ -31,7 +36,8 @@ class ModelBase(metaclass=abc.ABCMeta): ...@@ -31,7 +36,8 @@ class ModelBase(metaclass=abc.ABCMeta):
return encode(self.impl) == encode(rhs.impl) \ return encode(self.impl) == encode(rhs.impl) \
and self.trained_time == rhs.trained_time \ and self.trained_time == rhs.trained_time \
and list_eq(self.features, rhs.features) \ and list_eq(self.features, rhs.features) \
and encode(self.formulas) == encode(rhs.formulas) and encode(self.formulas) == encode(rhs.formulas) \
and encode(self.fit_target) == encode(rhs.fit_target)
def fit(self, x: pd.DataFrame, y: np.ndarray): def fit(self, x: pd.DataFrame, y: np.ndarray):
self.impl.fit(x[self.features].values, y.flatten()) self.impl.fit(x[self.features].values, y.flatten())
...@@ -56,15 +62,21 @@ class ModelBase(metaclass=abc.ABCMeta): ...@@ -56,15 +62,21 @@ class ModelBase(metaclass=abc.ABCMeta):
trained_time=self.trained_time, trained_time=self.trained_time,
desc=encode(self.impl), desc=encode(self.impl),
formulas=encode(self.formulas), formulas=encode(self.formulas),
fit_target=encode(self.fit_target),
internal_model=self.impl.__class__.__module__ + "." + self.impl.__class__.__name__) internal_model=self.impl.__class__.__module__ + "." + self.impl.__class__.__name__)
return model_desc return model_desc
@abc.abstractclassmethod @classmethod
@abc.abstractmethod
def load(cls, model_desc: dict): def load(cls, model_desc: dict):
obj_layout = cls() obj_layout = cls()
obj_layout.features = model_desc['features'] obj_layout.features = model_desc['features']
obj_layout.formulas = decode(model_desc['formulas']) obj_layout.formulas = decode(model_desc['formulas'])
obj_layout.trained_time = model_desc['trained_time'] obj_layout.trained_time = model_desc['trained_time']
obj_layout.impl = decode(model_desc['desc']) obj_layout.impl = decode(model_desc['desc'])
if 'fit_target' in model_desc:
obj_layout.fit_target = decode(model_desc['fit_target'])
else:
obj_layout.fit_target = None
return obj_layout return obj_layout
...@@ -27,8 +27,9 @@ class RandomForestRegressor(ModelBase): ...@@ -27,8 +27,9 @@ class RandomForestRegressor(ModelBase):
n_estimators: int=100, n_estimators: int=100,
max_features: str='auto', max_features: str='auto',
features=None, features=None,
fit_target=None,
**kwargs): **kwargs):
super().__init__(features) super().__init__(features=features, fit_target=fit_target)
self.impl = RandomForestRegressorImpl(n_estimators=n_estimators, self.impl = RandomForestRegressorImpl(n_estimators=n_estimators,
max_features=max_features, max_features=max_features,
**kwargs) **kwargs)
...@@ -59,8 +60,9 @@ class RandomForestClassifier(ModelBase): ...@@ -59,8 +60,9 @@ class RandomForestClassifier(ModelBase):
n_estimators: int=100, n_estimators: int=100,
max_features: str='auto', max_features: str='auto',
features=None, features=None,
fit_target=None,
**kwargs): **kwargs):
super().__init__(features) super().__init__(features=features, fit_target=fit_target)
self.impl = RandomForestClassifierImpl(n_estimators=n_estimators, self.impl = RandomForestClassifierImpl(n_estimators=n_estimators,
max_features=max_features, max_features=max_features,
**kwargs) **kwargs)
...@@ -92,9 +94,10 @@ class XGBRegressor(ModelBase): ...@@ -92,9 +94,10 @@ class XGBRegressor(ModelBase):
learning_rate: float=0.1, learning_rate: float=0.1,
max_depth: int=3, max_depth: int=3,
features=None, features=None,
fit_target=None,
n_jobs: int=1, n_jobs: int=1,
**kwargs): **kwargs):
super().__init__(features) super().__init__(features=features, fit_target=fit_target)
self.impl = XGBRegressorImpl(n_estimators=n_estimators, self.impl = XGBRegressorImpl(n_estimators=n_estimators,
learning_rate=learning_rate, learning_rate=learning_rate,
max_depth=max_depth, max_depth=max_depth,
...@@ -128,9 +131,10 @@ class XGBClassifier(ModelBase): ...@@ -128,9 +131,10 @@ class XGBClassifier(ModelBase):
learning_rate: float=0.1, learning_rate: float=0.1,
max_depth: int=3, max_depth: int=3,
features=None, features=None,
fit_target=None,
n_jobs: int=1, n_jobs: int=1,
**kwargs): **kwargs):
super().__init__(features) super().__init__(features=features, fit_target=fit_target)
self.impl = XGBClassifierImpl(n_estimators=n_estimators, self.impl = XGBClassifierImpl(n_estimators=n_estimators,
learning_rate=learning_rate, learning_rate=learning_rate,
max_depth=max_depth, max_depth=max_depth,
...@@ -171,10 +175,11 @@ class XGBTrainer(ModelBase): ...@@ -171,10 +175,11 @@ class XGBTrainer(ModelBase):
subsample=1., subsample=1.,
colsample_bytree=1., colsample_bytree=1.,
features=None, features=None,
fit_target=None,
random_state: int=0, random_state: int=0,
n_jobs: int=1, n_jobs: int=1,
**kwargs): **kwargs):
super().__init__(features) super().__init__(features=features, fit_target=fit_target)
self.params = { self.params = {
'silent': 1, 'silent': 1,
'objective': objective, 'objective': objective,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment