Commit 3f878191 authored by Dr.李

added more APIs for models

parent 81538ae6
......@@ -188,15 +188,28 @@ class Composer(object):
codes = x.index
return pd.DataFrame(model.predict(x_values).flatten(), index=codes)
def score(self, ref_date: str, x: pd.DataFrame = None, y: np.ndarray = None) -> float:
def score(self, ref_date: str, x: pd.DataFrame = None, y: np.ndarray = None, d_type: str = 'test') -> float:
model = self._fetch_latest_model(ref_date)
if x is None:
predict_data = self.data_meta.fetch_predict_data(ref_date, model)
x = predict_data['predict']['x']
if y is None:
y = predict_data['predict']['y']
if d_type == 'test':
test_data = self.data_meta.fetch_predict_data(ref_date, model)
x = test_data['predict']['x']
if y is None:
y = test_data['predict']['y']
else:
test_data = self.data_meta.fetch_train_data(ref_date, model)
x = test_data['train']['x']
if y is None:
y = test_data['train']['y']
return model.score(x, y)
def ic(self, ref_date) -> float:
model = self._fetch_latest_model(ref_date)
test_data = self.data_meta.fetch_predict_data(ref_date, model)
x = test_data['predict']['x']
y = test_data['predict']['y']
return model.ic(x, y)
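For orientation, a minimal usage sketch of the extended `score` and the new `ic` entry points; the `composer` object and the reference date are illustrative assumptions, not taken from this commit:

```python
# Hedged sketch: assumes `composer` is an already-fitted Composer instance.
ref_date = '2018-01-04'  # illustrative date

# Out-of-sample R-squared on the predict slice (the 'test' default).
test_score = composer.score(ref_date, d_type='test')

# In-sample R-squared on the training slice via the new d_type switch.
train_score = composer.score(ref_date, d_type='train')

# Pearson IC between predictions and realized targets on the predict slice.
info_coef = composer.ic(ref_date)

print(test_score, train_score, info_coef)
```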
def _fetch_latest_model(self, ref_date) -> ModelBase:
if self.is_updated:
sorted_keys = self.sorted_keys
......@@ -208,6 +221,9 @@ class Composer(object):
latest_index = bisect.bisect_left(sorted_keys, ref_date) - 1
return self.models[sorted_keys[latest_index]]
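The lookup above relies on `bisect_left` over date-sorted keys to select the newest model trained strictly before `ref_date`. A self-contained toy illustration of that indexing rule (dates are made up):

```python
import bisect

sorted_keys = ['2017-01-03', '2017-02-03', '2017-03-03']

# bisect_left gives the insertion point; subtracting one yields the
# newest key strictly earlier than the reference date.
latest_index = bisect.bisect_left(sorted_keys, '2017-02-15') - 1
print(sorted_keys[latest_index])  # -> '2017-02-03'

# Caveat: a ref_date earlier than every key gives index -1, which wraps
# to the *last* element via negative indexing; callers must guard this.
```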
def __getitem__(self, ref_date) -> ModelBase:
return self.models[ref_date]
def save(self) -> dict:
return dict(
alpha_model=self.alpha_model.save(),
......
......@@ -28,7 +28,6 @@ from alphamind.utilities import map_freq
def _merge_df(engine, names, factor_df, target_df, universe, dates, risk_model, neutralized_risk):
risk_df = engine.fetch_risk_model_range(universe, dates=dates, risk_model=risk_model)[1]
alpha_logger.info("risk data loading finished")
used_neutralized_risk = list(set(total_risk_factors).difference(names))
risk_df = risk_df[['trade_date', 'code'] + used_neutralized_risk].dropna()
target_df = pd.merge(target_df, risk_df, on=['trade_date', 'code'])
......@@ -209,32 +208,34 @@ def fetch_data_package(engine: SqlEngine,
neutralized_risk: Iterable[str] = None,
risk_model: str = 'short',
pre_process: Iterable[object] = None,
post_process: Iterable[object] = None) -> dict:
post_process: Iterable[object] = None,
fit_target: Union[Transformer, object] = None) -> dict:
alpha_logger.info("Starting data package fetching ...")
transformer = Transformer(alpha_factors)
names = transformer.names
dates, return_df, factor_df = prepare_data(engine,
dates, target_df, factor_df = prepare_data(engine,
transformer,
start_date,
end_date,
frequency,
universe,
benchmark,
warm_start)
warm_start,
fit_target=fit_target)
return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y, codes = \
_merge_df(engine, names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)
target_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y, codes = \
_merge_df(engine, names, factor_df, target_df, universe, dates, risk_model, neutralized_risk)
alpha_logger.info("data merging finished")
return_df['weight'] = train_x['weight']
return_df['industry'] = train_x['industry']
return_df['industry_code'] = train_x['industry_code']
return_df['isOpen'] = train_x['isOpen']
target_df['weight'] = train_x['weight']
target_df['industry'] = train_x['industry']
target_df['industry_code'] = train_x['industry_code']
target_df['isOpen'] = train_x['isOpen']
if neutralized_risk:
for i, name in enumerate(neutralized_risk):
return_df.loc[:, name] = risk_exp[:, i]
target_df.loc[:, name] = risk_exp[:, i]
alpha_logger.info("Loading data is finished")
......@@ -254,7 +255,7 @@ def fetch_data_package(engine: SqlEngine,
ret = dict()
ret['x_names'] = names
ret['settlement'] = return_df
ret['settlement'] = target_df
ret['train'] = {'x': train_x_buckets, 'y': train_y_buckets, 'risk': train_risk_buckets}
ret['predict'] = {'x': predict_x_buckets, 'y': predict_y_buckets, 'risk': predict_risk_buckets,
'code': predict_codes_bucket}
......@@ -266,7 +267,7 @@ def fetch_train_phase(engine,
ref_date,
frequency,
universe,
batch,
batch=1,
neutralized_risk: Iterable[str] = None,
risk_model: str = 'short',
pre_process: Iterable[object] = None,
......@@ -279,7 +280,7 @@ def fetch_train_phase(engine,
transformer = Transformer(alpha_factors)
p = Period(frequency)
p = Period(length=-(warm_start + batch + 1) * p.length(), units=p.units())
p = Period(length=-(warm_start + batch) * p.length(), units=p.units())
start_date = advanceDateByCalendar('china.sse', ref_date, p, BizDayConventions.Following)
dates = makeSchedule(start_date,
......@@ -311,10 +312,10 @@ def fetch_train_phase(engine,
if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
pyFinAssert(len(dates) >= 2, ValueError, "No previous data for training for the date {0}".format(ref_date))
end = dates[-2]
start = dates[-batch - 2] if batch <= len(dates) - 2 else dates[0]
start = dates[-batch - 1] if batch <= len(dates) - 1 else dates[0]
else:
end = dates[-1]
start = dates[-batch - 1] if batch <= len(dates) else dates[0]
start = dates[-batch] if batch <= len(dates) else dates[0]
index = (date_label >= start) & (date_label <= end)
this_raw_x = x_values[index]
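The off-by-one fix in this hunk makes the training window span exactly `batch` schedule points before the reference date. A toy walk-through of the corrected indexing, with plain strings standing in for the schedule dates:

```python
# Hedged sketch of the corrected window selection (toy schedule).
dates = ['d1', 'd2', 'd3', 'd4', 'd5']  # schedule ending at ref_date
batch = 2

# Case 1: ref_date itself lies on the schedule -> train up to dates[-2].
end = dates[-2]
start = dates[-batch - 1] if batch <= len(dates) - 1 else dates[0]
print(start, end)  # -> d3 d4, i.e. exactly `batch` points [d3, d4]

# Case 2: ref_date falls off-schedule -> train up to dates[-1].
end = dates[-1]
start = dates[-batch] if batch <= len(dates) else dates[0]
print(start, end)  # -> d4 d5, again exactly `batch` points
```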
......@@ -347,7 +348,7 @@ def fetch_predict_phase(engine,
ref_date,
frequency,
universe,
batch,
batch=1,
neutralized_risk: Iterable[str] = None,
risk_model: str = 'short',
pre_process: Iterable[object] = None,
......@@ -361,7 +362,7 @@ def fetch_predict_phase(engine,
transformer = Transformer(alpha_factors)
p = Period(frequency)
p = Period(length=-(warm_start + batch) * p.length(), units=p.units())
p = Period(length=-(warm_start + batch - 1) * p.length(), units=p.units())
start_date = advanceDateByCalendar('china.sse', ref_date, p, BizDayConventions.Following)
dates = makeSchedule(start_date,
......@@ -458,15 +459,15 @@ def fetch_predict_phase(engine,
if __name__ == '__main__':
from alphamind.api import risk_styles, industry_styles, standardize
engine = SqlEngine('postgresql+psycopg2://postgres:we083826@localhost/alpha')
engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
universe = Universe('zz500', ['hs300', 'zz500'])
neutralized_risk = risk_styles + industry_styles
res = fetch_predict_phase(engine, ['ep_q'],
'2012-01-05',
'5b',
universe,
16,
neutralized_risk=neutralized_risk,
post_process=[standardize],
fit_target='closePrice')
res = fetch_train_phase(engine, ['ep_q'],
'2012-01-05',
'5b',
universe,
2,
neutralized_risk=neutralized_risk,
post_process=[standardize],
fit_target='closePrice')
print(res)
......@@ -27,6 +27,13 @@ class ConstLinearModelImpl(object):
def predict(self, x: np.ndarray):
return x @ self.weights
def score(self, x: np.ndarray, y: np.ndarray) -> float:
y_hat = self.predict(x)
y_bar = y.mean()
ssto = ((y - y_bar) ** 2).sum()
sse = ((y - y_hat) ** 2).sum()
return 1. - sse / ssto
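The `score` added here is the standard coefficient of determination, R² = 1 - SSE/SSTO. A quick numpy check of the same arithmetic on toy data (not from the repo):

```python
import numpy as np

y = np.array([1.0, 2.0, 3.0, 4.0])      # realized targets
y_hat = np.array([1.1, 1.9, 3.2, 3.8])  # model predictions

ssto = ((y - y.mean()) ** 2).sum()  # total sum of squares -> 5.0
sse = ((y - y_hat) ** 2).sum()      # residual sum of squares -> 0.1
print(1. - sse / ssto)              # -> 0.98
```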
class ConstLinearModel(ModelBase):
......
......@@ -49,6 +49,10 @@ class ModelBase(metaclass=abc.ABCMeta):
def score(self, x: pd.DataFrame, y: np.ndarray) -> float:
return self.impl.score(x[self.features].values, y)
def ic(self, x: pd.DataFrame, y: np.ndarray) -> float:
predict_y = self.impl.predict(x[self.features].values)
return np.corrcoef(predict_y, y)[0, 1]
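`ic` here is the Pearson correlation between predictions and realized targets, read off the off-diagonal of `np.corrcoef`. A minimal numeric check with made-up arrays:

```python
import numpy as np

predict_y = np.array([0.1, 0.4, 0.2, 0.9])
y = np.array([0.2, 0.5, 0.1, 0.8])

# np.corrcoef returns the 2x2 correlation matrix; [0, 1] is the
# cross-correlation, i.e. the information coefficient.
ic = np.corrcoef(predict_y, y)[0, 1]
print(round(ic, 4))  # -> 0.9478 for these toy numbers
```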
@abc.abstractmethod
def save(self) -> dict:
......
......@@ -48,6 +48,18 @@ class TestLinearModel(unittest.TestCase):
self.assertEqual(model.features, new_model.features)
np.testing.assert_array_almost_equal(model.weights, new_model.weights)
def test_const_linear_model_score(self):
model = LinearRegression(['a', 'b', 'c'], fit_intercept=False)
model.fit(self.train_x, self.train_y)
expected_score = model.score(self.train_x, self.train_y)
const_model = ConstLinearModel(features=['a', 'b', 'c'],
weights=dict(zip(model.features, model.weights)))
calculated_score = const_model.score(self.train_x, self.train_y)
self.assertAlmostEqual(expected_score, calculated_score)
def test_linear_regression(self):
model = LinearRegression(['a', 'b', 'c'], fit_intercept=False)
model.fit(self.train_x, self.train_y)
......
<!-- @import "[TOC]" {cmd="toc" depthFrom=1 depthTo=6 orderedList=false} -->
<!-- code_chunk_output -->
* [Market and Strategy Review](#市场以及策略回顾)
    * [Summary](#摘要)
    * [Style Factors](#风格因子)
        * [All-Market Stocks (excluding new listings within three months of IPO)](#全市场股票去除上市三个月以内的新股)
        * [CSI 300](#沪深300)
        * [CSI 500](#中证500)
    * [Industry Factors](#行业因子)
        * [All-Market Stocks (excluding new listings within three months of IPO)](#全市场股票去除上市三个月以内的新股-1)
        * [CSI 300](#沪深300-1)
        * [CSI 500](#中证500-1)
    * [Style Analysis of Current Strategies](#当前策略的风格分析)
        * [Earnings Yield - `EARNYILD`](#收益估值-earnyild)
        * [Growth Factor - `GROWTH`](#成长因子-growth)
        * [Market Factor - `BETA`](#市场因子-beta)
        * [Size Factor - `SIZE`](#市值因子-size)
        * [Liquidity Factor - `LIQUIDTY`](#流动性因子-liquidty)
    * [Production Factor Performance](#生产因子表现)
        * [CSI 300 Enhanced](#沪深300增强)
        * [CSI 500 Enhanced](#中证500增强)
        * [Hongta Hongtu CSI 500 Enhanced](#红塔红土中证500增强)
<!-- /code_chunk_output -->
# Market and Strategy Review
## Summary
......@@ -11,11 +64,11 @@
* Among industries, `COMPUTER` and `Electronics` have been the relatively strong performers recently.
* Suggested actions
    * Consider keeping a moderate negative `SIZE` exposure;
    * Consider keeping a moderate negative `SIZE` exposure (with caution);
    * Use `LIQUIDTY` as an alpha factor, or keep some negative `LIQUIDTY` exposure;
    * Control the positive exposure to `EARNYILD`;
    * Control `BETA` exposure to neutral;
    * Moderately overweight `COMPUTER` and `Electronics`;
    * Moderately overweight `Computer` and `Electronics`
## Style Factors
......@@ -35,15 +88,7 @@
* Other factors with some trend behavior include `BETA`, `MOMENTUM`, and `BTOP`, but they are not stable enough.
<iframe
width="600"
height="400"
seamless
frameBorder="0"
scrolling="no"
src="http://10.63.6.13:8088/superset/explore/table/2/?form_data=%7B%22datasource%22%3A%222__table%22%2C%22viz_type%22%3A%22line%22%2C%22slice_id%22%3A8%2C%22granularity_sqla%22%3A%22trade_date%22%2C%22time_grain_sqla%22%3Anull%2C%22since%22%3A%22100+years+ago%22%2C%22until%22%3A%22now%22%2C%22metrics%22%3A%5B%22avg__ic%22%5D%2C%22groupby%22%3A%5B%22factor%22%5D%2C%22limit%22%3A50%2C%22timeseries_limit_metric%22%3Anull%2C%22order_desc%22%3Atrue%2C%22color_scheme%22%3A%22bnbColors%22%2C%22show_brush%22%3Afalse%2C%22show_legend%22%3Atrue%2C%22rich_tooltip%22%3Atrue%2C%22show_markers%22%3Afalse%2C%22line_interpolation%22%3A%22linear%22%2C%22contribution%22%3Afalse%2C%22x_axis_label%22%3A%22%22%2C%22bottom_margin%22%3A%22auto%22%2C%22x_axis_showminmax%22%3Atrue%2C%22x_axis_format%22%3A%22smart_date%22%2C%22y_axis_label%22%3A%22%22%2C%22left_margin%22%3A%22auto%22%2C%22y_axis_showminmax%22%3Atrue%2C%22y_log_scale%22%3Afalse%2C%22y_axis_format%22%3A%22.3s%22%2C%22y_axis_bounds%22%3A%5Bnull%2Cnull%5D%2C%22rolling_type%22%3A%22cumsum%22%2C%22time_compare%22%3Anull%2C%22num_period_compare%22%3A%22%22%2C%22period_ratio_type%22%3A%22growth%22%2C%22resample_how%22%3A%22mean%22%2C%22resample_rule%22%3A%221M%22%2C%22resample_fillmethod%22%3Anull%2C%22annotation_layers%22%3A%5B%5D%2C%22where%22%3A%22%22%2C%22having%22%3A%22%22%2C%22filters%22%3A%5B%7B%22col%22%3A%22type%22%2C%22op%22%3A%22%3D%3D%22%2C%22val%22%3A%22style%22%7D%2C%7B%22col%22%3A%22horizon%22%2C%22op%22%3A%22%3D%3D%22%2C%22val%22%3A%2220b%22%7D%2C%7B%22col%22%3A%22universe%22%2C%22op%22%3A%22%3D%3D%22%2C%22val%22%3A%22ashare_ex%22%7D%5D%7D&standalone=true&height=400"
>
</iframe>
![](figures/全市场风格.png)
### CSI 300
......@@ -55,6 +100,7 @@
* `MOMENTUM` has the highest cumulative IC;
* The `SIZE` reversal after 2017 is even more pronounced here than in the full market.
![](figures/沪深300风格.png)
### CSI 500
......@@ -64,6 +110,8 @@
* The `LIQUIDTY` effect is better here than in CSI 300;
* The `SIZE` reversal effect is not significant within CSI 500;
![](figures/中证500风格.png)
## Industry Factors
Industry factor behavior is more complex and harder to pin down; only tentative conclusions can be offered.
......@@ -75,34 +123,98 @@
* Industry IC volatility is large;
* Cyclical industries do correspond to larger IC swings, e.g. `Mining` and `IronSteel`.
![](figures/全市场行业.png)
### CSI 300
Broadly similar to the full-market pattern.
![](figures/沪深300行业.png)
### CSI 500
* `Electronics` is particularly strong; `Computer` is also a relatively good industry;
![](figures/中证500行业.png)
## Style Analysis of Current Strategies
### Earnings Yield - `EARNYILD`
All of our live strategies carry a fairly stable positive `EARNYILD` exposure, ranging from 40% to 70%.
![](figures/策略风险暴露_EARNYILD.png)
### Growth Factor - `GROWTH`
The current strategies do not control the `GROWTH` factor, but overall its exposure stays low, ranging from -22% to 9%.
![](figures/策略风险暴露_GROWTH.png)
### Market Factor - `BETA`
Historical `BETA` exposure has been inconsistent: before December 2017 it was low and tilted negative, while since March the `BETA` exposure has been significantly positive (except for CSI 300).
![](figures/策略风险暴露_BETA.png)
### Size Factor - `SIZE`
Size is a controlled factor; as expected, it sits near zero with essentially no exposure.
![](figures/策略风险暴露_SIZE.png)
### Liquidity Factor - `LIQUIDTY`
Since the end of 2017 the portfolios have kept a fairly stable positive liquidity exposure.
![](figures/策略风险暴露_LIQUIDTY.png)
## Production Factor Performance
### CSI 300 Enhanced
Factors currently used for the CSI 300 strategy:
* roe_q - experimental
* ep_q - experimental
* DivP - uqer
* cfinc1_q - experimental
* EBIT - uqer
* EARNYILD - uqer
* EPIBS - uqer
Except for EBIT, all have suffered drawdowns recently. EBIT is actually similar in type to the other factors (a quality indicator), so its resilience is probably coincidental. Overall, our current tilt toward value-and-quality strategies appears to have hit a bottleneck.
![](figures/沪深300增强生产因子.png)
### CSI 500 Enhanced
Factors currently used for CSI 500 Enhanced:
* BDTO - tiny
* CFinc1 - tiny
* CHV - tiny
* DivP - uqer
* IVR - experimental
* VAL - tiny
* eps_q - experimental
* roe_q - experimental
Within CSI 500, DivP has remained weak, and growth indicators such as GREV have also seen visible drawdowns. The relatively defensive performers are technical factors such as CHV and IVR. Notably, the value/quality factors such as roe_q, eps_q, and cfinc1_q, while somewhat pulled back or plateauing, show no pronounced drawdown.
![](figures/中证500增强生产因子.png)
### Hongta Hongtu CSI 500 Enhanced
Factors used in the Hongta Hongtu CSI 500 index enhancement:
* BDTO - tiny
* CFinc1 - tiny
* DROEAfterNonRecurring - legacy_factor
* DivP - uqer
* RVOL - tiny
* eps_q - experimental
The conclusions are similar to CSI 500 Enhanced. Somewhat surprisingly, RVOL, a factor constructed similarly to IVR, has not been robust.
![](figures/红塔红土中证500增强生产因子.png)