Commit fb1af4db authored by Dr.李

update tree based model

parent 60aa3422
@@ -18,6 +18,7 @@ from sqlalchemy import select, and_, outerjoin, join
 from sqlalchemy.sql import func
 from alphamind.data.engines.universe import Universe
 from alphamind.data.dbmodel.models import FactorMaster
+from alphamind.data.dbmodel.models import FactorLog
 from alphamind.data.dbmodel.models import Strategy
 from alphamind.data.dbmodel.models import DailyReturn
 from alphamind.data.dbmodel.models import IndexComponent
@@ -138,6 +139,10 @@ class SqlEngine(object):
         query = self.session.query(FactorMaster)
         return pd.read_sql(query.statement, query.session.bind)
 
+    def fetch_factor_coverage(self) -> pd.DataFrame:
+        query = self.session.query(FactorLog)
+        return pd.read_sql(query.statement, query.session.bind)
+
     def fetch_risk_meta(self) -> pd.DataFrame:
         query = self.session.query(RiskMaster)
         return pd.read_sql(query.statement, query.session.bind)
...
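
The added fetch_factor_coverage method loads the full FactorLog table into a DataFrame. A minimal usage sketch, assuming a SqlEngine connected to a configured alpha-mind database (this mirrors how the notebook below consumes the result; it is not part of the commit itself):

    from alphamind.api import *

    engine = SqlEngine()
    # FactorLog rows carry trade_date, factor, universe, source and a coverage
    # column, as used by the notebook below
    coverage = engine.fetch_factor_coverage()
    # mean coverage per factor over the sample
    report = coverage.groupby('factor')['coverage'].mean()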
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import datetime as dt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from matplotlib import pyplot as plt\n",
"from alphamind.api import *\n",
"from PyFin.api import *\n",
"\n",
"plt.style.use('fivethirtyeight')\n",
"engine = SqlEngine()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"u_name = 'zz800'\n",
"universe = Universe(u_name, [u_name])\n",
"factor_coverage = engine.fetch_factor_coverage()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"flitered_coverage = factor_coverage[((factor_coverage.source == 'uqer'))\n",
" & (factor_coverage.universe == u_name) \n",
" & (factor_coverage.trade_date >= '2012-01-01')]\n",
"coverage_report = flitered_coverage.groupby(['factor'])['coverage'].mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"alpha_factors = coverage_report[coverage_report > 0.98].index.tolist()\n",
"\n",
"#alpha_factors = ['']\n",
"\n",
"alpha_factors = {\n",
" f: DIFF(f) for f in alpha_factors\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"benchmark = 906\n",
"frequency = '2w'\n",
"batch = 4\n",
"start_date = '2012-01-01'\n",
"end_date = '2017-10-24'\n",
"method = 'risk_neutral'\n",
"neutralize_risk = industry_styles + ['SIZE']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_package = fetch_data_package(engine,\n",
" alpha_factors=alpha_factors,\n",
" start_date=start_date,\n",
" end_date=end_date,\n",
" frequency=frequency,\n",
" universe=universe,\n",
" benchmark=benchmark,\n",
" batch=batch,\n",
" neutralized_risk=neutralize_risk,\n",
" pre_process=[winsorize_normal],\n",
" post_process=[winsorize_normal],\n",
" warm_start=batch)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_x = data_package['train']['x']\n",
"train_y = data_package['train']['y']\n",
"\n",
"predict_x = data_package['predict']['x']\n",
"predict_y = data_package['predict']['y']\n",
"\n",
"features = data_package['x_names']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def plot_model_importance(model, features):\n",
" features = np.array(features)\n",
" n_features = len(features)\n",
" features_importance = model.feature_importances_\n",
" order = features_importance.argsort().argsort()\n",
" features = features[order >= n_features - 10]\n",
" features_importance = features_importance[order >= n_features - 10]\n",
" n_features = len(features)\n",
" plt.figure(figsize=(12, 6))\n",
" plt.barh(range(n_features), features_importance, align='center')\n",
" plt.yticks(np.arange(n_features), features)\n",
" plt.xlabel('Feature importance')\n",
" plt.ylabel('Feature')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 0. Train Score on a specific date\n",
"------------------------------------"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ref_date = list(train_x.keys())[-2]\n",
"sample_train_x = train_x[ref_date]\n",
"sample_train_y = train_y[ref_date].flatten()\n",
"\n",
"sample_test_x = predict_x[ref_date]\n",
"sample_test_y = predict_y[ref_date].flatten()\n",
"\n",
"n_estimators = 200\n",
"max_depth = 10\n",
"min_samples_split = 5\n",
"min_samples_leaf = 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = RandomForestRegressor(n_estimators=n_estimators,\n",
" max_depth=max_depth,\n",
" min_samples_split=min_samples_split,\n",
" min_samples_leaf=min_samples_leaf,\n",
" n_jobs=-1,\n",
" max_features='log2')\n",
"model.fit(sample_train_x, sample_train_y)\n",
"model.score(sample_train_x, sample_train_y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.score(sample_test_x, sample_test_y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_model_importance(model, features)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Train and test accuracy trend\n",
"----------"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"dates = sorted(train_x.keys())\n",
"accuray_table = pd.DataFrame(columns=['train', 'test'])\n",
"model_df = pd.Series()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for ref_date in dates:\n",
" sample_train_x = train_x[ref_date]\n",
" sample_train_y = train_y[ref_date].flatten()\n",
" \n",
" model = RandomForestRegressor(n_estimators=n_estimators,\n",
" max_depth=max_depth,\n",
" min_samples_split=min_samples_split,\n",
" min_samples_leaf=min_samples_leaf,\n",
" n_jobs=-1,\n",
" max_features='log2')\n",
" model.fit(sample_train_x, sample_train_y)\n",
" \n",
" train_score = model.score(sample_train_x, sample_train_y)\n",
" accuray_table.loc[ref_date, 'train'] = train_score\n",
" model_df.loc[ref_date] = model\n",
" alpha_logger.info('trade_date: {0} training finished'.format(ref_date))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"portfolio_risk_neutralize = ['SIZE']\n",
"portfolio_industry_neutralize = True\n",
"\n",
"settlement = data_package['settlement']\n",
"industry_dummies = pd.get_dummies(settlement['industry'].values)\n",
"risk_styles = settlement[portfolio_risk_neutralize].values\n",
"total_risks = settlement[neutralize_risk].values\n",
"final_res = np.zeros(len(dates))\n",
"method = 'risk_neutral'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for i, ref_date in enumerate(dates):\n",
" model = model_df[ref_date]\n",
" sample_test_x = predict_x[ref_date]\n",
" sample_test_y = predict_y[ref_date].flatten()\n",
" \n",
" cons = Constraints()\n",
" index = settlement.trade_date == ref_date\n",
" benchmark_w = settlement[index]['weight'].values\n",
" realized_r = settlement[index]['dx'].values\n",
" industry_names = settlement[index]['industry'].values\n",
" is_tradable = settlement[index]['isOpen'].values\n",
"\n",
" cons.add_exposure(['total'], np.ones((len(is_tradable), 1)))\n",
" cons.set_constraints('total', benchmark_w.sum(), benchmark_w.sum())\n",
"\n",
" if portfolio_industry_neutralize:\n",
" ind_exp = industry_dummies[index]\n",
"\n",
" risk_tags = ind_exp.columns\n",
" cons.add_exposure(risk_tags, ind_exp.values)\n",
" benchmark_exp = benchmark_w @ ind_exp.values\n",
"\n",
" for k, name in enumerate(risk_tags):\n",
" cons.set_constraints(name, benchmark_exp[k], benchmark_exp[k])\n",
"\n",
" if portfolio_risk_neutralize:\n",
" risk_exp = risk_styles[index]\n",
"\n",
" risk_tags = np.array(portfolio_risk_neutralize)\n",
" cons.add_exposure(risk_tags, risk_exp)\n",
"\n",
" benchmark_exp = benchmark_w @ risk_exp\n",
" for k, name in enumerate(risk_tags):\n",
" cons.set_constraints(name, benchmark_exp[k], benchmark_exp[k])\n",
"\n",
" risk_table = total_risks[index]\n",
"\n",
" y = model.predict(sample_test_x)\n",
" test_score = model.score(sample_test_x, sample_test_y)\n",
" accuray_table.loc[ref_date, 'test'] = test_score\n",
"\n",
" is_tradable[:] = True\n",
" weights, analysis = er_portfolio_analysis(y,\n",
" industry_names,\n",
" realized_r,\n",
" constraints=cons,\n",
" detail_analysis=True,\n",
" benchmark=benchmark_w,\n",
" is_tradable=is_tradable,\n",
" method=method)\n",
" \n",
" final_res[i] = analysis['er']['total'] / benchmark_w.sum()\n",
" alpha_logger.info('trade_date: {0} predicting finished'.format(ref_date))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"last_date = advanceDateByCalendar('china.sse', dates[-1], frequency)\n",
"\n",
"df = pd.Series(final_res, index=dates[1:] + [last_date])\n",
"df.sort_index(inplace=True)\n",
"df['2012-01-01':].cumsum().plot(figsize=(12, 6))\n",
"plt.title('Prod factors model {1} ({0})'.format(method, model.__class__.__name__))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"accuray_table.aggregate([np.mean, np.std])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}