Unverified commit 6f2b65e5, authored by iLampard, committed by GitHub

Merge pull request #7 from alpha-miner/master

merge update
parents 3dd7d3f7 31805b28
......@@ -417,8 +417,7 @@ class SqlEngine(object):
df = pd.read_sql(query, self.engine)
if universe.is_filtered:
codes = universe.query(self, start_date, end_date, dates)
df = pd.merge(df, codes, how='inner', on=['trade_date', 'code'])
df = pd.merge(df, universe_df, how='inner', on=['trade_date', 'code'])
if external_data is not None:
df = pd.merge(df, external_data, on=['trade_date', 'code']).dropna()
......@@ -435,6 +434,55 @@ class SqlEngine(object):
df = df.reset_index()
return pd.merge(df, universe_df[['trade_date', 'code']], how='inner')
def fetch_factor_range_forward(self,
                               universe: Universe,
                               factors: Union[Transformer, object],
                               start_date: str = None,
                               end_date: str = None,
                               dates: Iterable[str] = None):
    """Fetch the first dependent factor shifted one trade date forward.

    For every (trade_date, code) pair in *universe*, returns the value the
    factor takes on the NEXT trade date of the same code, labelled ``dx``.

    :param universe: universe used to restrict the returned codes/dates
    :param factors: a Transformer, or anything Transformer accepts
    :param start_date: range start (used when *dates* is None)
    :param end_date: range end (used when *dates* is None)
    :param dates: explicit trade dates; takes precedence over the range
    :return: pd.DataFrame with columns trade_date, code, dx, sorted by
             ['trade_date', 'code']
    """
    transformer = factors if isinstance(factors, Transformer) else Transformer(factors)
    factor_cols = _map_factors(transformer.dependency, factor_tables)

    universe_codes = universe.query(self, start_date, end_date, dates)
    code_list = universe_codes.code.unique().tolist()
    date_list = universe_codes.trade_date.astype(str).unique().tolist()

    # Build one big outer join rooted at Market, joining each factor table once.
    joined = Market
    seen_tables = {Market.__table__.name}
    for table in set(factor_cols.values()):
        if table.__table__.name in seen_tables:
            continue
        if dates is not None:
            date_cond = Market.trade_date.in_(dates)
        else:
            date_cond = Market.trade_date.between(start_date, end_date)
        joined = outerjoin(joined, table, and_(Market.trade_date == table.trade_date,
                                               Market.code == table.code,
                                               date_cond))
        seen_tables.add(table.__table__.name)

    # lag(..., -1) with a negative offset reads one row AHEAD within each
    # code partition ordered by trade_date, i.e. the next date's value.
    forward_value = func.lag(list(factor_cols.keys())[0], -1).over(
        partition_by=Market.code,
        order_by=Market.trade_date).label('dx')

    query = select([Market.trade_date, Market.code, forward_value]) \
        .select_from(joined) \
        .where(
            and_(
                Market.trade_date.in_(date_list),
                Market.code.in_(code_list)
            )
        )

    return pd.read_sql(query, self.engine).sort_values(['trade_date', 'code'])
def fetch_benchmark(self,
ref_date: str,
benchmark: int,
......@@ -988,9 +1036,6 @@ if __name__ == '__main__':
universe = Universe('', ['zz800'])
codes = engine.fetch_codes(ref_date, universe)
dates = makeSchedule('2010-01-01', '2018-02-01', '10b', 'china.sse')
# df = engine.fetch_factor_range(universe, DIFF('roe_q'), dates=dates)
risk_cov, risk_exposure = engine.fetch_risk_model(ref_date, codes)
factor_data = engine.fetch_factor_range(universe, ['roe_q'], dates=dates)
risk_cov, risk_exposure = engine.fetch_risk_model_range(universe, dates=dates)
dates = makeSchedule('2018-01-01', '2018-02-01', '10b', 'china.sse')
factor_data = engine.fetch_factor_range_forward(universe, ['roe_q'], dates=dates)
print(factor_data)
......@@ -7,6 +7,7 @@ Created on 2017-12-25
from typing import Iterable
from typing import Dict
from alphamind.data.dbmodel.models import Market
from alphamind.data.dbmodel.models import RiskCovDay
from alphamind.data.dbmodel.models import RiskCovShort
from alphamind.data.dbmodel.models import RiskCovLong
......@@ -22,7 +23,7 @@ from alphamind.data.dbmodel.models import RiskExposure
from alphamind.data.engines.industries import INDUSTRY_MAPPING
factor_tables = [RiskExposure, Uqer, Gogoal, Experimental, LegacyFactor, Tiny]
factor_tables = [Market, RiskExposure, Uqer, Gogoal, Experimental, LegacyFactor, Tiny]
def _map_risk_model_table(risk_model: str) -> tuple:
......@@ -44,13 +45,17 @@ def _map_factors(factors: Iterable[str], used_factor_tables) -> Dict:
if f not in excluded and f in t.__table__.columns:
factor_cols[t.__table__.columns[f]] = t
break
if not factor_cols:
raise ValueError(f"some factors in <{factors}> can't be find")
return factor_cols
def _map_industry_category(category: str) -> str:
if category == 'sw':
return '申万行业分类'
if category == 'sw_adj':
elif category == 'sw_adj':
return '申万行业分类修订'
elif category == 'zz':
return '中证行业分类'
......
......@@ -16,8 +16,8 @@ class TargetVolExecutor(ExecutorBase):
def __init__(self, window=30, target_vol=0.01):
    """Executor that scales leverage so realized volatility tracks a target.

    :param window: look-back length (in periods) for the rolling volatility
                   and rolling leverage estimates
    :param target_vol: desired per-period volatility level
    """
    super().__init__()
    # Rolling trackers over the 'return' and 'leverage' series; the scrape
    # contained a second, shadowed pair of these assignments (diff residue),
    # which is removed here — behavior is unchanged since the last
    # assignment always won.
    self.m_vol = MovingStandardDeviation(window, 'return')
    self.m_leverage = MovingAverage(window, 'leverage')
    self.target_vol = target_vol
    self.multiplier = 1.
......
......@@ -116,7 +116,8 @@ class DataMeta(object):
self.risk_model,
self.pre_process,
self.post_process,
self.warm_start)
self.warm_start,
fit_target=alpha_model.fit_target)
def fetch_predict_data(self,
ref_date: str,
......
......@@ -60,7 +60,8 @@ def prepare_data(engine: SqlEngine,
frequency: str,
universe: Universe,
benchmark: int,
warm_start: int = 0):
warm_start: int = 0,
fit_target: Union[Transformer, object]=None):
if warm_start > 0:
p = Period(frequency)
p = Period(length=-warm_start * p.length(), units=p.units())
......@@ -86,14 +87,22 @@ def prepare_data(engine: SqlEngine,
factors=transformer,
dates=dates).sort_values(['trade_date', 'code'])
alpha_logger.info("factor data loading finished")
return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
alpha_logger.info("return data loading finished")
if fit_target is None:
target_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
else:
one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
target_df = engine.fetch_factor_range_forward(universe, factors=fit_target, dates=dates + [one_more_date])
target_df = target_df[target_df.trade_date.isin(dates)]
target_df = target_df.groupby('code').apply(lambda x: x.fillna(method='pad'))
alpha_logger.info("fit target data loading finished")
industry_df = engine.fetch_industry_range(universe, dates=dates)
alpha_logger.info("industry data loading finished")
benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
alpha_logger.info("benchmark data loading finished")
df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()
df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()
df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
df = pd.merge(df, industry_df, on=['trade_date', 'code'])
df['weight'] = df['weight'].fillna(0.)
......@@ -261,7 +270,8 @@ def fetch_train_phase(engine,
risk_model: str = 'short',
pre_process: Iterable[object] = None,
post_process: Iterable[object] = None,
warm_start: int = 0) -> dict:
warm_start: int = 0,
fit_target: Union[Transformer, object] = None) -> dict:
if isinstance(alpha_factors, Transformer):
transformer = alpha_factors
else:
......@@ -281,7 +291,13 @@ def fetch_train_phase(engine,
horizon = map_freq(frequency)
factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates)
target_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
if fit_target is None:
target_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
else:
one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
target_df = engine.fetch_factor_range_forward(universe, factors=fit_target, dates=dates + [one_more_date])
target_df = target_df[target_df.trade_date.isin(dates)]
target_df = target_df.groupby('code').apply(lambda x: x.fillna(method='pad'))
df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()
......@@ -336,7 +352,7 @@ def fetch_predict_phase(engine,
pre_process: Iterable[object] = None,
post_process: Iterable[object] = None,
warm_start: int = 0,
fillna: str=None):
fillna: str = None):
if isinstance(alpha_factors, Transformer):
transformer = alpha_factors
else:
......@@ -356,7 +372,8 @@ def fetch_predict_phase(engine,
factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates)
if fillna:
factor_df = factor_df.groupby('trade_date').apply(lambda x: x.fillna(x.median())).reset_index(drop=True).dropna()
factor_df = factor_df.groupby('trade_date').apply(lambda x: x.fillna(x.median())).reset_index(
drop=True).dropna()
else:
factor_df = factor_df.dropna()
......@@ -416,13 +433,16 @@ def fetch_predict_phase(engine,
if __name__ == '__main__':
engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
from alphamind.api import risk_styles, industry_styles, standardize
engine = SqlEngine('postgresql+psycopg2://postgres:we083826@localhost/alpha')
universe = Universe('zz500', ['hs300', 'zz500'])
neutralized_risk = ['SIZE']
neutralized_risk = risk_styles + industry_styles
res = fetch_train_phase(engine, ['ep_q'],
'2012-01-05',
'5b',
universe,
16,
neutralized_risk=neutralized_risk)
'2012-01-05',
'5b',
universe,
16,
neutralized_risk=neutralized_risk,
post_process=[standardize],
fit_target='closePrice')
print(res)
......@@ -32,8 +32,9 @@ class ConstLinearModel(ModelBase):
def __init__(self,
features=None,
weights: dict = None):
super().__init__(features)
weights: dict = None,
fit_target=None):
super().__init__(features=features, fit_target=fit_target)
if features is not None and weights is not None:
pyFinAssert(len(features) == len(weights),
ValueError,
......@@ -57,8 +58,8 @@ class ConstLinearModel(ModelBase):
class LinearRegression(ModelBase):
def __init__(self, features=None, fit_intercept: bool = False, **kwargs):
super().__init__(features)
def __init__(self, features=None, fit_intercept: bool = False, fit_target=None, **kwargs):
super().__init__(features=features, fit_target=fit_target)
self.impl = LinearRegressionImpl(fit_intercept=fit_intercept, **kwargs)
def save(self) -> dict:
......@@ -84,8 +85,8 @@ class LinearRegression(ModelBase):
class LassoRegression(ModelBase):
def __init__(self, alpha=0.01, features=None, fit_intercept: bool = False, **kwargs):
super().__init__(features)
def __init__(self, alpha=0.01, features=None, fit_intercept: bool = False, fit_target=None, **kwargs):
super().__init__(features=features, fit_target=fit_target)
self.impl = Lasso(alpha=alpha, fit_intercept=fit_intercept, **kwargs)
def save(self) -> dict:
......@@ -111,8 +112,8 @@ class LassoRegression(ModelBase):
class LogisticRegression(ModelBase):
def __init__(self, features=None, fit_intercept: bool = False, **kwargs):
super().__init__(features)
def __init__(self, features=None, fit_intercept: bool = False, fit_target=None, **kwargs):
super().__init__(features=features, fit_target=fit_target)
self.impl = LogisticRegressionImpl(fit_intercept=fit_intercept, **kwargs)
def save(self) -> dict:
......
......@@ -18,12 +18,17 @@ from alphamind.data.transformer import Transformer
class ModelBase(metaclass=abc.ABCMeta):
def __init__(self, features=None, fit_target=None):
    """Base-class state for all alpha models.

    :param features: feature expressions; wrapped in a Transformer whose
                     resolved names become ``self.features``
    :param fit_target: optional target expression to fit against; also
                       wrapped in a Transformer when given
    """
    # Note: the scraped diff showed a stale one-parameter signature line
    # above this def; only the two-parameter signature is kept.
    if features is not None:
        self.formulas = Transformer(features)
        self.features = self.formulas.names
    else:
        self.features = None

    if fit_target is not None:
        self.fit_target = Transformer(fit_target)
    else:
        self.fit_target = None

    self.impl = None
    self.trained_time = None
......@@ -31,7 +36,8 @@ class ModelBase(metaclass=abc.ABCMeta):
return encode(self.impl) == encode(rhs.impl) \
and self.trained_time == rhs.trained_time \
and list_eq(self.features, rhs.features) \
and encode(self.formulas) == encode(rhs.formulas)
and encode(self.formulas) == encode(rhs.formulas) \
and encode(self.fit_target) == encode(rhs.fit_target)
def fit(self, x: pd.DataFrame, y: np.ndarray):
self.impl.fit(x[self.features].values, y.flatten())
......@@ -56,15 +62,21 @@ class ModelBase(metaclass=abc.ABCMeta):
trained_time=self.trained_time,
desc=encode(self.impl),
formulas=encode(self.formulas),
fit_target=encode(self.fit_target),
internal_model=self.impl.__class__.__module__ + "." + self.impl.__class__.__name__)
return model_desc
@classmethod
@abc.abstractmethod
def load(cls, model_desc: dict):
    """Reconstruct a model skeleton from a serialized description.

    The scraped diff carried both the deprecated ``abc.abstractclassmethod``
    decorator and its modern ``@classmethod`` + ``@abc.abstractmethod``
    replacement; only the non-deprecated pair is kept here.

    :param model_desc: dict produced by ``save`` (keys: 'features',
                       'formulas', 'trained_time', 'desc', and optionally
                       'fit_target')
    :return: an instance of *cls* with the persisted state restored
    """
    obj_layout = cls()
    obj_layout.features = model_desc['features']
    obj_layout.formulas = decode(model_desc['formulas'])
    obj_layout.trained_time = model_desc['trained_time']
    obj_layout.impl = decode(model_desc['desc'])
    # Older serialized models predate fit_target; tolerate its absence.
    if 'fit_target' in model_desc:
        obj_layout.fit_target = decode(model_desc['fit_target'])
    else:
        obj_layout.fit_target = None
    return obj_layout
......@@ -27,8 +27,9 @@ class RandomForestRegressor(ModelBase):
n_estimators: int=100,
max_features: str='auto',
features=None,
fit_target=None,
**kwargs):
super().__init__(features)
super().__init__(features=features, fit_target=fit_target)
self.impl = RandomForestRegressorImpl(n_estimators=n_estimators,
max_features=max_features,
**kwargs)
......@@ -59,8 +60,9 @@ class RandomForestClassifier(ModelBase):
n_estimators: int=100,
max_features: str='auto',
features=None,
fit_target=None,
**kwargs):
super().__init__(features)
super().__init__(features=features, fit_target=fit_target)
self.impl = RandomForestClassifierImpl(n_estimators=n_estimators,
max_features=max_features,
**kwargs)
......@@ -92,9 +94,10 @@ class XGBRegressor(ModelBase):
learning_rate: float=0.1,
max_depth: int=3,
features=None,
fit_target=None,
n_jobs: int=1,
**kwargs):
super().__init__(features)
super().__init__(features=features, fit_target=fit_target)
self.impl = XGBRegressorImpl(n_estimators=n_estimators,
learning_rate=learning_rate,
max_depth=max_depth,
......@@ -128,9 +131,10 @@ class XGBClassifier(ModelBase):
learning_rate: float=0.1,
max_depth: int=3,
features=None,
fit_target=None,
n_jobs: int=1,
**kwargs):
super().__init__(features)
super().__init__(features=features, fit_target=fit_target)
self.impl = XGBClassifierImpl(n_estimators=n_estimators,
learning_rate=learning_rate,
max_depth=max_depth,
......@@ -171,10 +175,11 @@ class XGBTrainer(ModelBase):
subsample=1.,
colsample_bytree=1.,
features=None,
fit_target=None,
random_state: int=0,
n_jobs: int=1,
**kwargs):
super().__init__(features)
super().__init__(features=features, fit_target=fit_target)
self.params = {
'silent': 1,
'objective': objective,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment