Commit 80748343 authored by Dr.李

added update special tables

parent a6e9e0c8
......@@ -17,25 +17,24 @@ import datetime as dt
start = dt.datetime.now()
universe_name = 'zz500'
universe = Universe('custom', ['zz800'])
factor_name = 'PE'
expression = 1. / LAST(factor_name)
simple_expression = CSRes(LAST('OperCashInToAsset'), 'roe_q')
alpha_factor_name = '1/PE'
alpha_factor = {alpha_factor_name: expression}
alpha_factor_name = 'alpha_factor'
alpha_factor = {alpha_factor_name: simple_expression}
# end of formula definition
engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
universe = Universe('custom', [universe_name])
neutralize_risk = ['SIZE'] + industry_styles
freq = '5b'
neutralize_risk = ['SIZE', 'LEVERAGE'] + industry_styles
freq = '10b'
n_bins = 5
horizon = map_freq(freq)
start_date = '2012-01-01'
end_date = '2017-11-21'
end_date = '2018-01-05'
dates = makeSchedule(start_date,
end_date,
......@@ -93,10 +92,9 @@ df = df.cumsum().plot(ax=axes[0], title='Quantile Analysis for {0}'.format(alpha
# =================================================================== #
factor_name = 'PE'
expression = DIFF(1./LAST(factor_name))
alpha_factor_name = '1/PE_1w_diff'
alpha_factor = {alpha_factor_name: expression}
alpha_factor_name = alpha_factor_name + '_1w_diff'
alpha_factor = {alpha_factor_name: DIFF(simple_expression)}
dates = makeSchedule(start_date,
end_date,
......
......@@ -24,7 +24,9 @@
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"\"\"\"\n",
......@@ -517,7 +519,7 @@
" risk = train_risk[ref_date][:, 1:]\n",
" new_x = cross_product(x, risk)\n",
"\n",
" model = LinearRegression(features=linear_model_features_keys, fit_intercept=True) # n_jobs=8, min_samples_split=20)\n",
" #model = LinearRegression(features=linear_model_features_keys, fit_intercept=True) # n_jobs=8, min_samples_split=20)\n",
" model = LassoRegression(alpha=0.01, features=linear_model_features_keys, fit_intercept=True) # n_jobs=8, min_samples_split=20)\n",
" model.fit(new_x, y)\n",
" models_series.loc[ref_date] = model\n",
......
......@@ -2,14 +2,15 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import numpy as np\n",
"import xgboost as xgb\n",
"from sklearn.metrics import r2_score\n",
"from sklearn.model_selection import train_test_split\n",
"from alphamind.api import *\n",
"from PyFin.api import *\n",
"\n",
......@@ -18,14 +19,14 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 52,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"start_date = '2015-01-01'\n",
"end_date = '2017-12-31'\n",
"start_date = '2012-01-01'\n",
"end_date = '2018-01-05'\n",
"\n",
"features = ['roe_q',\n",
" 'ep_q',\n",
......@@ -36,7 +37,7 @@
" 'EPIBS']\n",
"\n",
"freq = '5b'\n",
"batch = 16\n",
"batch = 32\n",
"universe = Universe('custom', ['zz500', 'hs300'])\n",
"benchmark = 905\n",
"neutralized_risk = ['SIZE'] + industry_styles\n",
......@@ -45,23 +46,29 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 53,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2018-01-08 10:14:20,323 - ALPHA_MIND - INFO - Starting data package fetching ...\n",
"2018-01-08 10:22:40,245 - ALPHA_MIND - INFO - Loading data is finished\n",
"2018-01-08 10:22:47,375 - ALPHA_MIND - INFO - Data processing is finished\n"
"2018-01-10 14:56:47,595 - ALPHA_MIND - INFO - Starting data package fetching ...\n",
"2018-01-10 14:56:54,781 - ALPHA_MIND - INFO - factor data loading finished\n",
"2018-01-10 14:57:03,949 - ALPHA_MIND - INFO - return data loading finished\n",
"2018-01-10 14:57:05,113 - ALPHA_MIND - INFO - industry data loading finished\n",
"2018-01-10 14:57:05,828 - ALPHA_MIND - INFO - benchmark data loading finished\n",
"2018-01-10 14:57:15,662 - ALPHA_MIND - INFO - risk data loading finished\n",
"2018-01-10 14:57:17,773 - ALPHA_MIND - INFO - data merging finished\n",
"2018-01-10 14:57:19,490 - ALPHA_MIND - INFO - Loading data is finished\n",
"2018-01-10 14:57:35,324 - ALPHA_MIND - INFO - Data processing is finished\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 8min 27s\n"
"Wall time: 47.7 s\n"
]
}
],
......@@ -83,7 +90,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 54,
"metadata": {
"collapsed": true
},
......@@ -112,24 +119,34 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 66,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2015-01-05 00:00:00\n",
"2015-04-28 00:00:00\n",
"2015-08-13 00:00:00\n",
"2015-12-07 00:00:00\n",
"2016-03-29 00:00:00\n",
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 447 ms\n"
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 1.26 s\n"
]
}
],
......@@ -156,15 +173,15 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 67,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0119584020732\n",
"-0.587410279791\n"
"0.0107609007052\n",
"-0.480548329833\n"
]
}
],
......@@ -178,29 +195,39 @@
"metadata": {},
"source": [
"## Lasso Regression\n",
"------------"
"---------"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 60,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2015-01-05 00:00:00\n",
"2015-04-28 00:00:00\n",
"2015-08-13 00:00:00\n",
"2015-12-07 00:00:00\n",
"2016-03-29 00:00:00\n",
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 518 ms\n"
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 1.58 s\n"
]
}
],
......@@ -227,15 +254,15 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 61,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.00981862410414\n",
"-0.570666932124\n"
"0.00875291615929\n",
"-0.475440026\n"
]
}
],
......@@ -272,24 +299,25 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2015-01-05 00:00:00\n",
"2015-04-28 00:00:00\n",
"2015-08-13 00:00:00\n",
"2015-12-07 00:00:00\n",
"2016-03-29 00:00:00\n",
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 15.6 s\n"
"2017-01-03 00:00:00\n"
]
},
{
"ename": "NameError",
"evalue": "name 'cross_product' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<timed exec>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n",
"\u001b[1;31mNameError\u001b[0m: name 'cross_product' is not defined"
]
}
],
......@@ -299,7 +327,7 @@
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
"for i, date in enumerate(train_dates[:1]):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
......@@ -327,8 +355,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"0.0322126965952\n",
"-0.67474281393\n"
"0.0291928676769\n",
"-0.24146254373\n"
]
}
],
......@@ -354,17 +382,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
"2015-01-05 00:00:00\n",
"2015-04-28 00:00:00\n",
"2015-08-13 00:00:00\n",
"2015-12-07 00:00:00\n",
"2016-03-29 00:00:00\n",
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 6.84 s\n"
"2017-01-03 00:00:00\n",
"2017-04-27 00:00:00\n",
"2017-08-15 00:00:00\n",
"2017-12-05 00:00:00\n",
"Wall time: 4.78 s\n"
]
}
],
......@@ -402,8 +424,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"0.000197987187525\n",
"-0.569332930413\n"
"0.000355789142204\n",
"-0.200552889618\n"
]
}
],
......@@ -429,17 +451,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
"2015-01-05 00:00:00\n",
"2015-04-28 00:00:00\n",
"2015-08-13 00:00:00\n",
"2015-12-07 00:00:00\n",
"2016-03-29 00:00:00\n",
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 2min 25s\n"
"2017-01-03 00:00:00\n",
"2017-04-27 00:00:00\n",
"2017-08-15 00:00:00\n",
"2017-12-05 00:00:00\n",
"Wall time: 1min 18s\n"
]
}
],
......@@ -473,8 +489,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"0.0149542839772\n",
"-0.566305808069\n"
"0.0137863030105\n",
"-0.197952235791\n"
]
}
],
......@@ -493,24 +509,18 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2015-01-05 00:00:00\n",
"2015-04-28 00:00:00\n",
"2015-08-13 00:00:00\n",
"2015-12-07 00:00:00\n",
"2016-03-29 00:00:00\n",
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 1min 53s\n"
"2017-01-03 00:00:00\n",
"2017-04-27 00:00:00\n",
"2017-08-15 00:00:00\n",
"2017-12-05 00:00:00\n",
"Wall time: 1min 32s\n"
]
}
],
......@@ -541,15 +551,115 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0575499865219\n",
"-0.209037365429\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Native XGBoost Regressor\n",
"---------------"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 6min 57s\n"
]
}
],
"source": [
"%%time\n",
"# Fit one native-API xgboost regressor per date in train_x and collect the\n",
"# R^2 on the fitting split (train_scores) and on predict_x/predict_y\n",
"# (predict_scores).\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
"    if i % 15 == 0:\n",
"        print(date)  # progress marker on every 15th date\n",
"    x = train_x[date]\n",
"    y = train_y[date]\n",
"\n",
"    # 33% of the rows are held out as the evaluation set that drives early stopping.\n",
"    x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.33, random_state=42)\n",
"\n",
"    dtrain = xgb.DMatrix(x_train, y_train)\n",
"    deval = xgb.DMatrix(x_eval, y_eval)\n",
"    # xgboost selects the booster through the 'booster' key; 'boost' is not a\n",
"    # parameter it reads.\n",
"    param = {'silent': 1,\n",
"             'objective': 'reg:linear',\n",
"             'max_depth': 3,\n",
"             'eta': 0.005,\n",
"             'booster': 'gbtree',\n",
"             'tree_method': 'hist',\n",
"             'subsample': 0.1,\n",
"             'colsample_bytree': 0.25}\n",
"    num_round = 2000\n",
"    model = xgb.train(param, dtrain, num_round, evals=[(deval, 'eval')], early_stopping_rounds=50, verbose_eval=False)\n",
"\n",
"    y_train_predict = model.predict(dtrain)\n",
"    train_scores.append(r2_score(y_train, y_train_predict, multioutput='uniform_average'))\n",
"\n",
"    p_x = predict_x[date]\n",
"    p_y = predict_y[date]\n",
"    dtest = xgb.DMatrix(p_x, p_y)\n",
"\n",
"    y_test_predict = model.predict(dtest)\n",
"    predict_scores.append(r2_score(p_y, y_test_predict, multioutput='uniform_average'))"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.1654455223\n",
"-0.70322789707\n"
"0.0158347715471\n",
"-0.477095380466\n"
]
}
],
......
......@@ -9,23 +9,25 @@
"outputs": [],
"source": [
"%matplotlib inline\n",
"import xgboost as xgb\n",
"import numpy as np\n",
"from alphamind.api import *\n",
"from PyFin.api import *\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"engine = SqlEngine()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"start_date = '2012-01-01'\n",
"end_date = '2017-12-31'\n",
"end_date = '2018-01-05'\n",
"\n",
"features = ['roe_q',\n",
" 'ep_q',\n",
......@@ -35,7 +37,7 @@
" 'EARNYILD',\n",
" 'EPIBS']\n",
"\n",
"freq = '5b'\n",
"freq = '10b'\n",
"batch = 16\n",
"universe = Universe('custom', ['zz500', 'hs300'])\n",
"benchmark = 905\n",
......@@ -45,29 +47,29 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2018-01-08 16:54:05,618 - ALPHA_MIND - INFO - Starting data package fetching ...\n",
"2018-01-08 16:54:15,904 - ALPHA_MIND - INFO - factor data loading finished\n",
"2018-01-08 16:54:26,575 - ALPHA_MIND - INFO - return data loading finished\n",
"2018-01-08 16:54:27,944 - ALPHA_MIND - INFO - industry data loading finished\n",
"2018-01-08 16:54:28,634 - ALPHA_MIND - INFO - benchmark data loading finished\n",
"2018-01-08 16:54:41,966 - ALPHA_MIND - INFO - risk data loading finished\n",
"2018-01-08 16:54:45,557 - ALPHA_MIND - INFO - data merging finished\n",
"2018-01-08 16:54:48,150 - ALPHA_MIND - INFO - Loading data is finished\n",
"2018-01-08 16:54:59,541 - ALPHA_MIND - INFO - Data processing is finished\n"
"2018-01-11 15:12:44,105 - ALPHA_MIND - INFO - Starting data package fetching ...\n",
"2018-01-11 15:12:53,578 - ALPHA_MIND - INFO - factor data loading finished\n",
"2018-01-11 15:13:03,880 - ALPHA_MIND - INFO - return data loading finished\n",
"2018-01-11 15:13:05,384 - ALPHA_MIND - INFO - industry data loading finished\n",
"2018-01-11 15:13:06,178 - ALPHA_MIND - INFO - benchmark data loading finished\n",
"2018-01-11 15:13:17,845 - ALPHA_MIND - INFO - risk data loading finished\n",
"2018-01-11 15:13:21,266 - ALPHA_MIND - INFO - data merging finished\n",
"2018-01-11 15:13:23,371 - ALPHA_MIND - INFO - Loading data is finished\n",
"2018-01-11 15:13:33,174 - ALPHA_MIND - INFO - Data processing is finished\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 53.9 s\n"
"Wall time: 49.1 s\n"
]
}
],
......@@ -89,7 +91,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 25,
"metadata": {
"collapsed": true
},
......@@ -110,8 +112,10 @@
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"for key, val in train_y.items():\n",
......@@ -131,7 +135,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 27,
"metadata": {},
"outputs": [
{
......@@ -158,7 +162,7 @@
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 6.92 s\n"
"Wall time: 5.34 s\n"
]
}
],
......@@ -185,15 +189,15 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.54106394519\n",
"0.519270440032\n"
"0.541013986745\n",
"0.51932344036\n"
]
}
],
......@@ -212,7 +216,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 29,
"metadata": {},
"outputs": [
{
......@@ -239,7 +243,7 @@
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 17min 2s\n"
"Wall time: 15min 34s\n"
]
}
],
......@@ -266,15 +270,15 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.557667621301\n",
"0.554107283453\n"
"0.557563825608\n",
"0.553974775005\n"
]
}
],
......@@ -293,7 +297,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 31,
"metadata": {},
"outputs": [
{
......@@ -320,7 +324,7 @@
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 4min 33s\n"
"Wall time: 13min 40s\n"
]
}
],
......@@ -336,7 +340,7 @@
" x = train_x[date]\n",
" y = train_y[date]\n",
" \n",
" model = XGBClassifier(n_estimators=500,\n",
" model = XGBClassifier(n_estimators=1000,\n",
" learning_rate=0.02,\n",
" max_depth=3,\n",
" n_jobs=-1,\n",
......@@ -352,15 +356,118 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.642946015759\n",
"0.537550683184\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Native XGBoost Classifier\n",
"---------------"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 1min 6s\n"
]
}
],
"source": [
"%%time\n",
"# Fit one native-API xgboost binary classifier per date in train_x. Accuracy\n",
"# at a 0.5 probability threshold is collected for the fitting split\n",
"# (train_scores) and for predict_x/predict_y (predict_scores).\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
"    if i % 15 == 0:\n",
"        print(date)  # progress marker on every 15th date\n",
"    x = train_x[date]\n",
"    y = train_y[date]\n",
"\n",
"    # 33% of the rows are held out as the evaluation set that drives early stopping.\n",
"    x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.33, random_state=42)\n",
"\n",
"    dtrain = xgb.DMatrix(x_train, y_train)\n",
"    deval = xgb.DMatrix(x_eval, y_eval)\n",
"    # xgboost selects the booster through the 'booster' key; with the former\n",
"    # 'boost' key the requested DART booster was never applied.\n",
"    param = {'silent': 1,\n",
"             'objective': 'binary:logistic',\n",
"             'max_depth': 3,\n",
"             'eta': 0.01,\n",
"             'booster': 'dart',\n",
"             'tree_method': 'hist',\n",
"             'subsample': 0.25,\n",
"             'colsample_bytree': 0.5}\n",
"    num_round = 2000\n",
"    model = xgb.train(param, dtrain, num_round, evals=[(deval, 'eval')], early_stopping_rounds=50, verbose_eval=False)\n",
"\n",
"    # Share of fitting-split rows whose thresholded prediction equals the label.\n",
"    y_train_predict = model.predict(dtrain)\n",
"    label = dtrain.get_label()\n",
"    train_score = np.sum((y_train_predict > 0.5) == label) / float(len(label))\n",
"\n",
"    train_scores.append(train_score)\n",
"\n",
"    p_x = predict_x[date]\n",
"    p_y = predict_y[date]\n",
"    dtest = xgb.DMatrix(p_x, p_y)\n",
"\n",
"    # Same accuracy measure on the prediction set for this date.\n",
"    y_test_predict = model.predict(dtest)\n",
"    p_label = dtest.get_label()\n",
"    test_score = np.sum((y_test_predict > 0.5) == p_label) / float(len(p_label))\n",
"    predict_scores.append(test_score)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.612408578757\n",
"0.543523900352\n"
"0.567225761699\n",
"0.550997907465\n"
]
}
],
......@@ -379,7 +486,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 35,
"metadata": {
"collapsed": true
},
......@@ -397,7 +504,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 36,
"metadata": {},
"outputs": [
{
......@@ -424,7 +531,7 @@
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 26.7 s\n"
"Wall time: 36.1 s\n"
]
}
],
......@@ -455,15 +562,15 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.568151341668\n",
"0.517317353974\n"
"0.568125478425\n",
"0.517523115163\n"
]
}
],
......@@ -482,7 +589,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 38,
"metadata": {},
"outputs": [
{
......@@ -509,7 +616,7 @@
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 12min 2s\n"
"Wall time: 14min 40s\n"
]
}
],
......@@ -540,15 +647,15 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.549010335268\n",
"0.56003282178\n"
"0.549090142483\n",
"0.559944504146\n"
]
}
],
......@@ -567,7 +674,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 40,
"metadata": {},
"outputs": [
{
......@@ -594,7 +701,7 @@
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 17min 2s\n"
"Wall time: 12min 25s\n"
]
}
],
......@@ -630,15 +737,122 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.59375573895\n",
"0.55230987889\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Native XGBoost Classifier with More Features\n",
"---------------"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 5min 23s\n"
]
}
],
"source": [
"%%time\n",
"# Fit one native-API xgboost binary classifier per date in train_x on the\n",
"# output of cross_product(features, risk columns). Accuracy at a 0.5\n",
"# probability threshold is collected in train_scores / predict_scores.\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
"    if i % 15 == 0:\n",
"        print(date)  # progress marker on every 15th date\n",
"    x = train_x[date]\n",
"    y = train_y[date]\n",
"    # Column 0 of the risk matrix is dropped before building the crossed features.\n",
"    risk = train_risk[date][:, 1:]\n",
"    new_x = cross_product(x, risk)\n",
"\n",
"    # 33% of the rows are held out as the evaluation set that drives early stopping.\n",
"    x_train, x_eval, y_train, y_eval = train_test_split(new_x, y, test_size=0.33, random_state=42)\n",
"\n",
"    dtrain = xgb.DMatrix(x_train, y_train)\n",
"    deval = xgb.DMatrix(x_eval, y_eval)\n",
"    param = {'silent': 1,\n",
"             'objective': 'binary:logistic',\n",
"             'max_depth': 3,\n",
"             'eta': 0.01,\n",
"             'booster': 'dart',\n",
"             'tree_method': 'hist',\n",
"             'subsample': 0.25,\n",
"             'colsample_bytree': 0.5}\n",
"    num_round = 2000\n",
"    model = xgb.train(param, dtrain, num_round, evals=[(deval, 'eval')], early_stopping_rounds=50, verbose_eval=False)\n",
"\n",
"    # Share of fitting-split rows whose thresholded prediction equals the label.\n",
"    y_train_predict = model.predict(dtrain)\n",
"    label = dtrain.get_label()\n",
"    train_score = np.sum((y_train_predict > 0.5) == label) / float(len(label))\n",
"\n",
"    train_scores.append(train_score)\n",
"\n",
"    # The prediction set gets the same feature construction as the training set.\n",
"    p_x = predict_x[date]\n",
"    p_y = predict_y[date]\n",
"    p_risk = predict_risk[date][:, 1:]\n",
"    new_p_x = cross_product(p_x, p_risk)\n",
"    dtest = xgb.DMatrix(new_p_x, p_y)\n",
"\n",
"    y_test_predict = model.predict(dtest)\n",
"    p_label = dtest.get_label()\n",
"    test_score = np.sum((y_test_predict > 0.5) == p_label) / float(len(p_label))\n",
"    predict_scores.append(test_score)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.593739136739\n",
"0.552533996977\n"
"0.560057712549\n",
"0.552663472836\n"
]
}
],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment