Commit fb1af4db authored by Dr.李's avatar Dr.李

update tree based model

parent 60aa3422
......@@ -18,6 +18,7 @@ from sqlalchemy import select, and_, outerjoin, join
from sqlalchemy.sql import func
from alphamind.data.engines.universe import Universe
from alphamind.data.dbmodel.models import FactorMaster
from alphamind.data.dbmodel.models import FactorLog
from alphamind.data.dbmodel.models import Strategy
from alphamind.data.dbmodel.models import DailyReturn
from alphamind.data.dbmodel.models import IndexComponent
......@@ -138,6 +139,10 @@ class SqlEngine(object):
query = self.session.query(FactorMaster)
return pd.read_sql(query.statement, query.session.bind)
def fetch_factor_coverage(self) -> pd.DataFrame:
    """Load the entire FactorLog table into a DataFrame.

    No filtering is applied here; callers slice by source / universe /
    trade_date themselves (see the coverage-report notebook usage).
    """
    coverage_query = self.session.query(FactorLog)
    return pd.read_sql(coverage_query.statement, coverage_query.session.bind)
def fetch_risk_meta(self) -> pd.DataFrame:
    """Load the entire RiskMaster table into a DataFrame.

    Mirrors ``fetch_factor_master``: the whole table is returned
    unfiltered for the caller to inspect.
    """
    meta_query = self.session.query(RiskMaster)
    return pd.read_sql(meta_query.statement, meta_query.session.bind)
......
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import datetime as dt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from matplotlib import pyplot as plt\n",
"from alphamind.api import *\n",
"from PyFin.api import *\n",
"\n",
"plt.style.use('fivethirtyeight')\n",
"engine = SqlEngine()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"u_name = 'zz800'\n",
"universe = Universe(u_name, [u_name])\n",
"factor_coverage = engine.fetch_factor_coverage()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"filtered_coverage = factor_coverage[((factor_coverage.source == 'uqer'))\n",
" & (factor_coverage.universe == u_name) \n",
" & (factor_coverage.trade_date >= '2012-01-01')]\n",
"coverage_report = filtered_coverage.groupby(['factor'])['coverage'].mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"alpha_factors = coverage_report[coverage_report > 0.98].index.tolist()\n",
"\n",
"#alpha_factors = ['']\n",
"\n",
"alpha_factors = {\n",
" f: DIFF(f) for f in alpha_factors\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"benchmark = 906\n",
"frequency = '2w'\n",
"batch = 4\n",
"start_date = '2012-01-01'\n",
"end_date = '2017-10-24'\n",
"method = 'risk_neutral'\n",
"neutralize_risk = industry_styles + ['SIZE']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_package = fetch_data_package(engine,\n",
" alpha_factors=alpha_factors,\n",
" start_date=start_date,\n",
" end_date=end_date,\n",
" frequency=frequency,\n",
" universe=universe,\n",
" benchmark=benchmark,\n",
" batch=batch,\n",
" neutralized_risk=neutralize_risk,\n",
" pre_process=[winsorize_normal],\n",
" post_process=[winsorize_normal],\n",
" warm_start=batch)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_x = data_package['train']['x']\n",
"train_y = data_package['train']['y']\n",
"\n",
"predict_x = data_package['predict']['x']\n",
"predict_y = data_package['predict']['y']\n",
"\n",
"features = data_package['x_names']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def plot_model_importance(model, features):\n",
" features = np.array(features)\n",
" n_features = len(features)\n",
" features_importance = model.feature_importances_\n",
" order = features_importance.argsort().argsort()\n",
" features = features[order >= n_features - 10]\n",
" features_importance = features_importance[order >= n_features - 10]\n",
" n_features = len(features)\n",
" plt.figure(figsize=(12, 6))\n",
" plt.barh(range(n_features), features_importance, align='center')\n",
" plt.yticks(np.arange(n_features), features)\n",
" plt.xlabel('Feature importance')\n",
" plt.ylabel('Feature')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 0. Train Score on a specific date\n",
"------------------------------------"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ref_date = list(train_x.keys())[-2]\n",
"sample_train_x = train_x[ref_date]\n",
"sample_train_y = train_y[ref_date].flatten()\n",
"\n",
"sample_test_x = predict_x[ref_date]\n",
"sample_test_y = predict_y[ref_date].flatten()\n",
"\n",
"n_estimators = 200\n",
"max_depth = 10\n",
"min_samples_split = 5\n",
"min_samples_leaf = 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = RandomForestRegressor(n_estimators=n_estimators,\n",
" max_depth=max_depth,\n",
" min_samples_split=min_samples_split,\n",
" min_samples_leaf=min_samples_leaf,\n",
" n_jobs=-1,\n",
" max_features='log2')\n",
"model.fit(sample_train_x, sample_train_y)\n",
"model.score(sample_train_x, sample_train_y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.score(sample_test_x, sample_test_y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_model_importance(model, features)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Train and test accuracy trend\n",
"----------"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"dates = sorted(train_x.keys())\n",
"accuray_table = pd.DataFrame(columns=['train', 'test'])\n",
"model_df = pd.Series()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for ref_date in dates:\n",
" sample_train_x = train_x[ref_date]\n",
" sample_train_y = train_y[ref_date].flatten()\n",
" \n",
" model = RandomForestRegressor(n_estimators=n_estimators,\n",
" max_depth=max_depth,\n",
" min_samples_split=min_samples_split,\n",
" min_samples_leaf=min_samples_leaf,\n",
" n_jobs=-1,\n",
" max_features='log2')\n",
" model.fit(sample_train_x, sample_train_y)\n",
" \n",
" train_score = model.score(sample_train_x, sample_train_y)\n",
" accuray_table.loc[ref_date, 'train'] = train_score\n",
" model_df.loc[ref_date] = model\n",
" alpha_logger.info('trade_date: {0} training finished'.format(ref_date))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"portfolio_risk_neutralize = ['SIZE']\n",
"portfolio_industry_neutralize = True\n",
"\n",
"settlement = data_package['settlement']\n",
"industry_dummies = pd.get_dummies(settlement['industry'].values)\n",
"risk_styles = settlement[portfolio_risk_neutralize].values\n",
"total_risks = settlement[neutralize_risk].values\n",
"final_res = np.zeros(len(dates))\n",
"method = 'risk_neutral'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for i, ref_date in enumerate(dates):\n",
" model = model_df[ref_date]\n",
" sample_test_x = predict_x[ref_date]\n",
" sample_test_y = predict_y[ref_date].flatten()\n",
" \n",
" cons = Constraints()\n",
" index = settlement.trade_date == ref_date\n",
" benchmark_w = settlement[index]['weight'].values\n",
" realized_r = settlement[index]['dx'].values\n",
" industry_names = settlement[index]['industry'].values\n",
" is_tradable = settlement[index]['isOpen'].values\n",
"\n",
" cons.add_exposure(['total'], np.ones((len(is_tradable), 1)))\n",
" cons.set_constraints('total', benchmark_w.sum(), benchmark_w.sum())\n",
"\n",
" if portfolio_industry_neutralize:\n",
" ind_exp = industry_dummies[index]\n",
"\n",
" risk_tags = ind_exp.columns\n",
" cons.add_exposure(risk_tags, ind_exp.values)\n",
" benchmark_exp = benchmark_w @ ind_exp.values\n",
"\n",
" for k, name in enumerate(risk_tags):\n",
" cons.set_constraints(name, benchmark_exp[k], benchmark_exp[k])\n",
"\n",
" if portfolio_risk_neutralize:\n",
" risk_exp = risk_styles[index]\n",
"\n",
" risk_tags = np.array(portfolio_risk_neutralize)\n",
" cons.add_exposure(risk_tags, risk_exp)\n",
"\n",
" benchmark_exp = benchmark_w @ risk_exp\n",
" for k, name in enumerate(risk_tags):\n",
" cons.set_constraints(name, benchmark_exp[k], benchmark_exp[k])\n",
"\n",
" risk_table = total_risks[index]\n",
"\n",
" y = model.predict(sample_test_x)\n",
" test_score = model.score(sample_test_x, sample_test_y)\n",
" accuray_table.loc[ref_date, 'test'] = test_score\n",
"\n",
" is_tradable[:] = True\n",
" weights, analysis = er_portfolio_analysis(y,\n",
" industry_names,\n",
" realized_r,\n",
" constraints=cons,\n",
" detail_analysis=True,\n",
" benchmark=benchmark_w,\n",
" is_tradable=is_tradable,\n",
" method=method)\n",
" \n",
" final_res[i] = analysis['er']['total'] / benchmark_w.sum()\n",
" alpha_logger.info('trade_date: {0} predicting finished'.format(ref_date))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"last_date = advanceDateByCalendar('china.sse', dates[-1], frequency)\n",
"\n",
"df = pd.Series(final_res, index=dates[1:] + [last_date])\n",
"df.sort_index(inplace=True)\n",
"df['2012-01-01':].cumsum().plot(figsize=(12, 6))\n",
"plt.title('Prod factors model {1} ({0})'.format(method, model.__class__.__name__))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"accuray_table.aggregate([np.mean, np.std])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment