Commit 80748343 authored by Dr.李

added update special tables

parent a6e9e0c8
...@@ -17,25 +17,24 @@ import datetime as dt ...@@ -17,25 +17,24 @@ import datetime as dt
start = dt.datetime.now() start = dt.datetime.now()
universe_name = 'zz500' universe = Universe('custom', ['zz800'])
factor_name = 'PE' simple_expression = CSRes(LAST('OperCashInToAsset'), 'roe_q')
expression = 1. / LAST(factor_name)
alpha_factor_name = '1/PE' alpha_factor_name = 'alpha_factor'
alpha_factor = {alpha_factor_name: expression} alpha_factor = {alpha_factor_name: simple_expression}
# end of formula definition # end of formula definition
engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha') engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
universe = Universe('custom', [universe_name])
neutralize_risk = ['SIZE'] + industry_styles neutralize_risk = ['SIZE', 'LEVERAGE'] + industry_styles
freq = '5b' freq = '10b'
n_bins = 5 n_bins = 5
horizon = map_freq(freq) horizon = map_freq(freq)
start_date = '2012-01-01' start_date = '2012-01-01'
end_date = '2017-11-21' end_date = '2018-01-05'
dates = makeSchedule(start_date, dates = makeSchedule(start_date,
end_date, end_date,
...@@ -93,10 +92,9 @@ df = df.cumsum().plot(ax=axes[0], title='Quantile Analysis for {0}'.format(alpha ...@@ -93,10 +92,9 @@ df = df.cumsum().plot(ax=axes[0], title='Quantile Analysis for {0}'.format(alpha
# =================================================================== # # =================================================================== #
factor_name = 'PE' factor_name = 'PE'
expression = DIFF(1./LAST(factor_name))
alpha_factor_name = '1/PE_1w_diff' alpha_factor_name = alpha_factor_name + '_1w_diff'
alpha_factor = {alpha_factor_name: expression} alpha_factor = {alpha_factor_name: DIFF(simple_expression)}
dates = makeSchedule(start_date, dates = makeSchedule(start_date,
end_date, end_date,
......
...@@ -24,7 +24,9 @@ ...@@ -24,7 +24,9 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 2,
"metadata": {}, "metadata": {
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"\"\"\"\n", "\"\"\"\n",
...@@ -517,7 +519,7 @@ ...@@ -517,7 +519,7 @@
" risk = train_risk[ref_date][:, 1:]\n", " risk = train_risk[ref_date][:, 1:]\n",
" new_x = cross_product(x, risk)\n", " new_x = cross_product(x, risk)\n",
"\n", "\n",
" model = LinearRegression(features=linear_model_features_keys, fit_intercept=True) # n_jobs=8, min_samples_split=20)\n", " #model = LinearRegression(features=linear_model_features_keys, fit_intercept=True) # n_jobs=8, min_samples_split=20)\n",
" model = LassoRegression(alpha=0.01, features=linear_model_features_keys, fit_intercept=True) # n_jobs=8, min_samples_split=20)\n", " model = LassoRegression(alpha=0.01, features=linear_model_features_keys, fit_intercept=True) # n_jobs=8, min_samples_split=20)\n",
" model.fit(new_x, y)\n", " model.fit(new_x, y)\n",
" models_series.loc[ref_date] = model\n", " models_series.loc[ref_date] = model\n",
......
...@@ -2,14 +2,15 @@ ...@@ -2,14 +2,15 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 8,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"%matplotlib inline\n", "%matplotlib inline\n",
"import numpy as np\n", "import numpy as np\n",
"import xgboost as xgb\n",
"from sklearn.metrics import r2_score\n",
"from sklearn.model_selection import train_test_split\n",
"from alphamind.api import *\n", "from alphamind.api import *\n",
"from PyFin.api import *\n", "from PyFin.api import *\n",
"\n", "\n",
...@@ -18,14 +19,14 @@ ...@@ -18,14 +19,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 52,
"metadata": { "metadata": {
"collapsed": true "collapsed": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"start_date = '2015-01-01'\n", "start_date = '2012-01-01'\n",
"end_date = '2017-12-31'\n", "end_date = '2018-01-05'\n",
"\n", "\n",
"features = ['roe_q',\n", "features = ['roe_q',\n",
" 'ep_q',\n", " 'ep_q',\n",
...@@ -36,7 +37,7 @@ ...@@ -36,7 +37,7 @@
" 'EPIBS']\n", " 'EPIBS']\n",
"\n", "\n",
"freq = '5b'\n", "freq = '5b'\n",
"batch = 16\n", "batch = 32\n",
"universe = Universe('custom', ['zz500', 'hs300'])\n", "universe = Universe('custom', ['zz500', 'hs300'])\n",
"benchmark = 905\n", "benchmark = 905\n",
"neutralized_risk = ['SIZE'] + industry_styles\n", "neutralized_risk = ['SIZE'] + industry_styles\n",
...@@ -45,23 +46,29 @@ ...@@ -45,23 +46,29 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 53,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"2018-01-08 10:14:20,323 - ALPHA_MIND - INFO - Starting data package fetching ...\n", "2018-01-10 14:56:47,595 - ALPHA_MIND - INFO - Starting data package fetching ...\n",
"2018-01-08 10:22:40,245 - ALPHA_MIND - INFO - Loading data is finished\n", "2018-01-10 14:56:54,781 - ALPHA_MIND - INFO - factor data loading finished\n",
"2018-01-08 10:22:47,375 - ALPHA_MIND - INFO - Data processing is finished\n" "2018-01-10 14:57:03,949 - ALPHA_MIND - INFO - return data loading finished\n",
"2018-01-10 14:57:05,113 - ALPHA_MIND - INFO - industry data loading finished\n",
"2018-01-10 14:57:05,828 - ALPHA_MIND - INFO - benchmark data loading finished\n",
"2018-01-10 14:57:15,662 - ALPHA_MIND - INFO - risk data loading finished\n",
"2018-01-10 14:57:17,773 - ALPHA_MIND - INFO - data merging finished\n",
"2018-01-10 14:57:19,490 - ALPHA_MIND - INFO - Loading data is finished\n",
"2018-01-10 14:57:35,324 - ALPHA_MIND - INFO - Data processing is finished\n"
] ]
}, },
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Wall time: 8min 27s\n" "Wall time: 47.7 s\n"
] ]
} }
], ],
...@@ -83,7 +90,7 @@ ...@@ -83,7 +90,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 54,
"metadata": { "metadata": {
"collapsed": true "collapsed": true
}, },
...@@ -112,24 +119,34 @@ ...@@ -112,24 +119,34 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 66,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"2015-01-05 00:00:00\n", "2012-02-15 00:00:00\n",
"2015-04-28 00:00:00\n", "2012-06-06 00:00:00\n",
"2015-08-13 00:00:00\n", "2012-09-20 00:00:00\n",
"2015-12-07 00:00:00\n", "2013-01-15 00:00:00\n",
"2016-03-29 00:00:00\n", "2013-05-14 00:00:00\n",
"2016-07-18 00:00:00\n", "2013-08-30 00:00:00\n",
"2016-11-09 00:00:00\n", "2013-12-24 00:00:00\n",
"2017-03-02 00:00:00\n", "2014-04-17 00:00:00\n",
"2017-06-22 00:00:00\n", "2014-08-05 00:00:00\n",
"2017-10-12 00:00:00\n", "2014-11-26 00:00:00\n",
"Wall time: 447 ms\n" "2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 1.26 s\n"
] ]
} }
], ],
...@@ -156,15 +173,15 @@ ...@@ -156,15 +173,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 67,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"0.0119584020732\n", "0.0107609007052\n",
"-0.587410279791\n" "-0.480548329833\n"
] ]
} }
], ],
...@@ -178,29 +195,39 @@ ...@@ -178,29 +195,39 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Lasso Regression\n", "## Lasso Regression\n",
"------------" "---------"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 60,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"2015-01-05 00:00:00\n", "2012-02-15 00:00:00\n",
"2015-04-28 00:00:00\n", "2012-06-06 00:00:00\n",
"2015-08-13 00:00:00\n", "2012-09-20 00:00:00\n",
"2015-12-07 00:00:00\n", "2013-01-15 00:00:00\n",
"2016-03-29 00:00:00\n", "2013-05-14 00:00:00\n",
"2016-07-18 00:00:00\n", "2013-08-30 00:00:00\n",
"2016-11-09 00:00:00\n", "2013-12-24 00:00:00\n",
"2017-03-02 00:00:00\n", "2014-04-17 00:00:00\n",
"2017-06-22 00:00:00\n", "2014-08-05 00:00:00\n",
"2017-10-12 00:00:00\n", "2014-11-26 00:00:00\n",
"Wall time: 518 ms\n" "2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 1.58 s\n"
] ]
} }
], ],
...@@ -227,15 +254,15 @@ ...@@ -227,15 +254,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 61,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"0.00981862410414\n", "0.00875291615929\n",
"-0.570666932124\n" "-0.475440026\n"
] ]
} }
], ],
...@@ -272,24 +299,25 @@ ...@@ -272,24 +299,25 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 34,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"2015-01-05 00:00:00\n", "2017-01-03 00:00:00\n"
"2015-04-28 00:00:00\n", ]
"2015-08-13 00:00:00\n", },
"2015-12-07 00:00:00\n", {
"2016-03-29 00:00:00\n", "ename": "NameError",
"2016-07-18 00:00:00\n", "evalue": "name 'cross_product' is not defined",
"2016-11-09 00:00:00\n", "output_type": "error",
"2017-03-02 00:00:00\n", "traceback": [
"2017-06-22 00:00:00\n", "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"2017-10-12 00:00:00\n", "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"Wall time: 15.6 s\n" "\u001b[1;32m<timed exec>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n",
"\u001b[1;31mNameError\u001b[0m: name 'cross_product' is not defined"
] ]
} }
], ],
...@@ -299,7 +327,7 @@ ...@@ -299,7 +327,7 @@
"train_scores = []\n", "train_scores = []\n",
"predict_scores = []\n", "predict_scores = []\n",
"\n", "\n",
"for i, date in enumerate(train_dates):\n", "for i, date in enumerate(train_dates[:1]):\n",
" if i % 15 == 0:\n", " if i % 15 == 0:\n",
" print(date)\n", " print(date)\n",
" x = train_x[date]\n", " x = train_x[date]\n",
...@@ -327,8 +355,8 @@ ...@@ -327,8 +355,8 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"0.0322126965952\n", "0.0291928676769\n",
"-0.67474281393\n" "-0.24146254373\n"
] ]
} }
], ],
...@@ -354,17 +382,11 @@ ...@@ -354,17 +382,11 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"2015-01-05 00:00:00\n", "2017-01-03 00:00:00\n",
"2015-04-28 00:00:00\n", "2017-04-27 00:00:00\n",
"2015-08-13 00:00:00\n", "2017-08-15 00:00:00\n",
"2015-12-07 00:00:00\n", "2017-12-05 00:00:00\n",
"2016-03-29 00:00:00\n", "Wall time: 4.78 s\n"
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 6.84 s\n"
] ]
} }
], ],
...@@ -402,8 +424,8 @@ ...@@ -402,8 +424,8 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"0.000197987187525\n", "0.000355789142204\n",
"-0.569332930413\n" "-0.200552889618\n"
] ]
} }
], ],
...@@ -429,17 +451,11 @@ ...@@ -429,17 +451,11 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"2015-01-05 00:00:00\n", "2017-01-03 00:00:00\n",
"2015-04-28 00:00:00\n", "2017-04-27 00:00:00\n",
"2015-08-13 00:00:00\n", "2017-08-15 00:00:00\n",
"2015-12-07 00:00:00\n", "2017-12-05 00:00:00\n",
"2016-03-29 00:00:00\n", "Wall time: 1min 18s\n"
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 2min 25s\n"
] ]
} }
], ],
...@@ -473,8 +489,8 @@ ...@@ -473,8 +489,8 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"0.0149542839772\n", "0.0137863030105\n",
"-0.566305808069\n" "-0.197952235791\n"
] ]
} }
], ],
...@@ -493,24 +509,18 @@ ...@@ -493,24 +509,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 22, "execution_count": 16,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"2015-01-05 00:00:00\n", "2017-01-03 00:00:00\n",
"2015-04-28 00:00:00\n", "2017-04-27 00:00:00\n",
"2015-08-13 00:00:00\n", "2017-08-15 00:00:00\n",
"2015-12-07 00:00:00\n", "2017-12-05 00:00:00\n",
"2016-03-29 00:00:00\n", "Wall time: 1min 32s\n"
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 1min 53s\n"
] ]
} }
], ],
...@@ -541,15 +551,115 @@ ...@@ -541,15 +551,115 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 23, "execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0575499865219\n",
"-0.209037365429\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Native XGBoost Regressor\n",
"---------------"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 6min 57s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" \n",
" x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.33, random_state=42)\n",
" \n",
" dtrain = xgb.DMatrix(x_train, y_train)\n",
" deval = xgb.DMatrix(x_eval, y_eval)\n",
" param = {'silent': 1,\n",
" 'objective': 'reg:linear',\n",
" 'max_depth': 3,\n",
" 'eta': 0.005,\n",
" 'boost': 'gbtree',\n",
" 'tree_method': 'hist',\n",
" 'subsample': 0.1,\n",
" 'colsample_bytree': 0.25}\n",
" num_round = 2000\n",
" model = xgb.train(param, dtrain, num_round, evals=[(deval, 'eval')], early_stopping_rounds=50, verbose_eval=False)\n",
" \n",
" y_train_predict = model.predict(dtrain)\n",
" train_scores.append(r2_score(y_train, y_train_predict, multioutput='uniform_average'))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" dtest = xgb.DMatrix(p_x, p_y)\n",
" \n",
" y_test_predict = model.predict(dtest)\n",
" predict_scores.append(r2_score(p_y, y_test_predict, multioutput='uniform_average'))"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"0.1654455223\n", "0.0158347715471\n",
"-0.70322789707\n" "-0.477095380466\n"
] ]
} }
], ],
......
...@@ -9,23 +9,25 @@ ...@@ -9,23 +9,25 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"%matplotlib inline\n", "%matplotlib inline\n",
"import xgboost as xgb\n",
"import numpy as np\n", "import numpy as np\n",
"from alphamind.api import *\n", "from alphamind.api import *\n",
"from PyFin.api import *\n", "from PyFin.api import *\n",
"from sklearn.model_selection import train_test_split\n",
"\n", "\n",
"engine = SqlEngine()" "engine = SqlEngine()"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 23,
"metadata": { "metadata": {
"collapsed": true "collapsed": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"start_date = '2012-01-01'\n", "start_date = '2012-01-01'\n",
"end_date = '2017-12-31'\n", "end_date = '2018-01-05'\n",
"\n", "\n",
"features = ['roe_q',\n", "features = ['roe_q',\n",
" 'ep_q',\n", " 'ep_q',\n",
...@@ -35,7 +37,7 @@ ...@@ -35,7 +37,7 @@
" 'EARNYILD',\n", " 'EARNYILD',\n",
" 'EPIBS']\n", " 'EPIBS']\n",
"\n", "\n",
"freq = '5b'\n", "freq = '10b'\n",
"batch = 16\n", "batch = 16\n",
"universe = Universe('custom', ['zz500', 'hs300'])\n", "universe = Universe('custom', ['zz500', 'hs300'])\n",
"benchmark = 905\n", "benchmark = 905\n",
...@@ -45,29 +47,29 @@ ...@@ -45,29 +47,29 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 24,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"2018-01-08 16:54:05,618 - ALPHA_MIND - INFO - Starting data package fetching ...\n", "2018-01-11 15:12:44,105 - ALPHA_MIND - INFO - Starting data package fetching ...\n",
"2018-01-08 16:54:15,904 - ALPHA_MIND - INFO - factor data loading finished\n", "2018-01-11 15:12:53,578 - ALPHA_MIND - INFO - factor data loading finished\n",
"2018-01-08 16:54:26,575 - ALPHA_MIND - INFO - return data loading finished\n", "2018-01-11 15:13:03,880 - ALPHA_MIND - INFO - return data loading finished\n",
"2018-01-08 16:54:27,944 - ALPHA_MIND - INFO - industry data loading finished\n", "2018-01-11 15:13:05,384 - ALPHA_MIND - INFO - industry data loading finished\n",
"2018-01-08 16:54:28,634 - ALPHA_MIND - INFO - benchmark data loading finished\n", "2018-01-11 15:13:06,178 - ALPHA_MIND - INFO - benchmark data loading finished\n",
"2018-01-08 16:54:41,966 - ALPHA_MIND - INFO - risk data loading finished\n", "2018-01-11 15:13:17,845 - ALPHA_MIND - INFO - risk data loading finished\n",
"2018-01-08 16:54:45,557 - ALPHA_MIND - INFO - data merging finished\n", "2018-01-11 15:13:21,266 - ALPHA_MIND - INFO - data merging finished\n",
"2018-01-08 16:54:48,150 - ALPHA_MIND - INFO - Loading data is finished\n", "2018-01-11 15:13:23,371 - ALPHA_MIND - INFO - Loading data is finished\n",
"2018-01-08 16:54:59,541 - ALPHA_MIND - INFO - Data processing is finished\n" "2018-01-11 15:13:33,174 - ALPHA_MIND - INFO - Data processing is finished\n"
] ]
}, },
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Wall time: 53.9 s\n" "Wall time: 49.1 s\n"
] ]
} }
], ],
...@@ -89,7 +91,7 @@ ...@@ -89,7 +91,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 25,
"metadata": { "metadata": {
"collapsed": true "collapsed": true
}, },
...@@ -110,8 +112,10 @@ ...@@ -110,8 +112,10 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 26,
"metadata": {}, "metadata": {
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"for key, val in train_y.items():\n", "for key, val in train_y.items():\n",
...@@ -131,7 +135,7 @@ ...@@ -131,7 +135,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 27,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -158,7 +162,7 @@ ...@@ -158,7 +162,7 @@
"2017-05-15 00:00:00\n", "2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n", "2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n", "2017-12-20 00:00:00\n",
"Wall time: 6.92 s\n" "Wall time: 5.34 s\n"
] ]
} }
], ],
...@@ -185,15 +189,15 @@ ...@@ -185,15 +189,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 28,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"0.54106394519\n", "0.541013986745\n",
"0.519270440032\n" "0.51932344036\n"
] ]
} }
], ],
...@@ -212,7 +216,7 @@ ...@@ -212,7 +216,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 21, "execution_count": 29,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -239,7 +243,7 @@ ...@@ -239,7 +243,7 @@
"2017-05-15 00:00:00\n", "2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n", "2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n", "2017-12-20 00:00:00\n",
"Wall time: 17min 2s\n" "Wall time: 15min 34s\n"
] ]
} }
], ],
...@@ -266,15 +270,15 @@ ...@@ -266,15 +270,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 22, "execution_count": 30,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"0.557667621301\n", "0.557563825608\n",
"0.554107283453\n" "0.553974775005\n"
] ]
} }
], ],
...@@ -293,7 +297,7 @@ ...@@ -293,7 +297,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 31,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -320,7 +324,7 @@ ...@@ -320,7 +324,7 @@
"2017-05-15 00:00:00\n", "2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n", "2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n", "2017-12-20 00:00:00\n",
"Wall time: 4min 33s\n" "Wall time: 13min 40s\n"
] ]
} }
], ],
...@@ -336,7 +340,7 @@ ...@@ -336,7 +340,7 @@
" x = train_x[date]\n", " x = train_x[date]\n",
" y = train_y[date]\n", " y = train_y[date]\n",
" \n", " \n",
" model = XGBClassifier(n_estimators=500,\n", " model = XGBClassifier(n_estimators=1000,\n",
" learning_rate=0.02,\n", " learning_rate=0.02,\n",
" max_depth=3,\n", " max_depth=3,\n",
" n_jobs=-1,\n", " n_jobs=-1,\n",
...@@ -352,15 +356,118 @@ ...@@ -352,15 +356,118 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.642946015759\n",
"0.537550683184\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Native XGBoost Classifier\n",
"---------------"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 1min 6s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" \n",
" x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.33, random_state=42)\n",
" \n",
" dtrain = xgb.DMatrix(x_train, y_train)\n",
" deval = xgb.DMatrix(x_eval, y_eval)\n",
" param = {'silent': 1,\n",
" 'objective': 'binary:logistic',\n",
" 'max_depth': 3,\n",
" 'eta': 0.01,\n",
" 'boost': 'dart',\n",
" 'tree_method': 'hist',\n",
" 'subsample': 0.25,\n",
" 'colsample_bytree': 0.5}\n",
" num_round = 2000\n",
" model = xgb.train(param, dtrain, num_round, evals=[(deval, 'eval')], early_stopping_rounds=50, verbose_eval=False)\n",
" \n",
" y_train_predict = model.predict(dtrain)\n",
" label = dtrain.get_label()\n",
" train_score = np.sum((y_train_predict > 0.5) == label) / float(len(label))\n",
"\n",
" train_scores.append(train_score)\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" dtest = xgb.DMatrix(p_x, p_y)\n",
" \n",
" y_test_predict = model.predict(dtest)\n",
" p_label = dtest.get_label()\n",
" test_score = np.sum((y_test_predict > 0.5) == p_label) / float(len(p_label))\n",
" predict_scores.append(test_score)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"0.612408578757\n", "0.567225761699\n",
"0.543523900352\n" "0.550997907465\n"
] ]
} }
], ],
...@@ -379,7 +486,7 @@ ...@@ -379,7 +486,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 35,
"metadata": { "metadata": {
"collapsed": true "collapsed": true
}, },
...@@ -397,7 +504,7 @@ ...@@ -397,7 +504,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 36,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -424,7 +531,7 @@ ...@@ -424,7 +531,7 @@
"2017-05-15 00:00:00\n", "2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n", "2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n", "2017-12-20 00:00:00\n",
"Wall time: 26.7 s\n" "Wall time: 36.1 s\n"
] ]
} }
], ],
...@@ -455,15 +562,15 @@ ...@@ -455,15 +562,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 37,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"0.568151341668\n", "0.568125478425\n",
"0.517317353974\n" "0.517523115163\n"
] ]
} }
], ],
...@@ -482,7 +589,7 @@ ...@@ -482,7 +589,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 19, "execution_count": 38,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -509,7 +616,7 @@ ...@@ -509,7 +616,7 @@
"2017-05-15 00:00:00\n", "2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n", "2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n", "2017-12-20 00:00:00\n",
"Wall time: 12min 2s\n" "Wall time: 14min 40s\n"
] ]
} }
], ],
...@@ -540,15 +647,15 @@ ...@@ -540,15 +647,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 20, "execution_count": 39,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"0.549010335268\n", "0.549090142483\n",
"0.56003282178\n" "0.559944504146\n"
] ]
} }
], ],
...@@ -567,7 +674,7 @@ ...@@ -567,7 +674,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 40,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -594,7 +701,7 @@ ...@@ -594,7 +701,7 @@
"2017-05-15 00:00:00\n", "2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n", "2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n", "2017-12-20 00:00:00\n",
"Wall time: 17min 2s\n" "Wall time: 12min 25s\n"
] ]
} }
], ],
...@@ -630,15 +737,122 @@ ...@@ -630,15 +737,122 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.59375573895\n",
"0.55230987889\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Native XGBoost Classifier with More Features\n",
"---------------"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 5min 23s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" risk = train_risk[date][:, 1:]\n",
" new_x = cross_product(x, risk)\n",
" \n",
" x_train, x_eval, y_train, y_eval = train_test_split(new_x, y, test_size=0.33, random_state=42)\n",
" \n",
" dtrain = xgb.DMatrix(x_train, y_train)\n",
" deval = xgb.DMatrix(x_eval, y_eval)\n",
" param = {'silent': 1,\n",
" 'objective': 'binary:logistic',\n",
" 'max_depth': 3,\n",
" 'eta': 0.01,\n",
" 'booster': 'dart',\n",
" 'tree_method': 'hist',\n",
" 'subsample': 0.25,\n",
" 'colsample_bytree': 0.5}\n",
" num_round = 2000\n",
" model = xgb.train(param, dtrain, num_round, evals=[(deval, 'eval')], early_stopping_rounds=50, verbose_eval=False)\n",
" \n",
" y_train_predict = model.predict(dtrain)\n",
" label = dtrain.get_label()\n",
" train_score = np.sum((y_train_predict > 0.5) == label) / float(len(label))\n",
"\n",
" train_scores.append(train_score)\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" p_risk = predict_risk[date][:, 1:]\n",
" new_p_x = cross_product(p_x, p_risk)\n",
" dtest = xgb.DMatrix(new_p_x, p_y)\n",
" \n",
" y_test_predict = model.predict(dtest)\n",
" p_label = dtest.get_label()\n",
" test_score = np.sum((y_test_predict > 0.5) == p_label) / float(len(p_label))\n",
" predict_scores.append(test_score)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"0.593739136739\n", "0.560057712549\n",
"0.552533996977\n" "0.552663472836\n"
] ]
} }
], ],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment