Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Sign in
Toggle navigation
A
alpha-mind
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Dr.李
alpha-mind
Commits
80748343
Commit
80748343
authored
Jan 11, 2018
by
Dr.李
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added update special tables
parent
a6e9e0c8
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
488 additions
and
164 deletions
+488
-164
formula_expression.py
alphamind/examples/formula_expression.py
+10
-12
candidate_prod_model_20171204.ipynb
notebooks/candidate_prod_model_20171204.ipynb
+4
-2
model_comparing.ipynb
notebooks/model_comparing.ipynb
+211
-101
model_comparing_classifiers.ipynb
notebooks/model_comparing_classifiers.ipynb
+263
-49
No files found.
alphamind/examples/formula_expression.py
View file @
80748343
...
...
@@ -17,25 +17,24 @@ import datetime as dt
start
=
dt
.
datetime
.
now
()
universe
_name
=
'zz500'
universe
=
Universe
(
'custom'
,
[
'zz800'
])
factor_name
=
'PE'
expression
=
1.
/
LAST
(
factor_name
)
simple_expression
=
CSRes
(
LAST
(
'OperCashInToAsset'
),
'roe_q'
)
alpha_factor_name
=
'
1/PE
'
alpha_factor
=
{
alpha_factor_name
:
expression
}
alpha_factor_name
=
'
alpha_factor
'
alpha_factor
=
{
alpha_factor_name
:
simple_
expression
}
# end of formula definition
engine
=
SqlEngine
(
'postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha'
)
universe
=
Universe
(
'custom'
,
[
universe_name
])
neutralize_risk
=
[
'SIZE'
]
+
industry_styles
freq
=
'
5
b'
neutralize_risk
=
[
'SIZE'
,
'LEVERAGE'
]
+
industry_styles
freq
=
'
10
b'
n_bins
=
5
horizon
=
map_freq
(
freq
)
start_date
=
'2012-01-01'
end_date
=
'201
7-11-21
'
end_date
=
'201
8-01-05
'
dates
=
makeSchedule
(
start_date
,
end_date
,
...
...
@@ -93,10 +92,9 @@ df = df.cumsum().plot(ax=axes[0], title='Quantile Analysis for {0}'.format(alpha
# =================================================================== #
factor_name
=
'PE'
expression
=
DIFF
(
1.
/
LAST
(
factor_name
))
alpha_factor_name
=
'1/PE
_1w_diff'
alpha_factor
=
{
alpha_factor_name
:
expression
}
alpha_factor_name
=
alpha_factor_name
+
'
_1w_diff'
alpha_factor
=
{
alpha_factor_name
:
DIFF
(
simple_expression
)
}
dates
=
makeSchedule
(
start_date
,
end_date
,
...
...
notebooks/candidate_prod_model_20171204.ipynb
View file @
80748343
...
...
@@ -24,7 +24,9 @@
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"\"\"\"\n",
...
...
@@ -517,7 +519,7 @@
" risk = train_risk[ref_date][:, 1:]\n",
" new_x = cross_product(x, risk)\n",
"\n",
" model = LinearRegression(features=linear_model_features_keys, fit_intercept=True) # n_jobs=8, min_samples_split=20)\n",
"
#
model = LinearRegression(features=linear_model_features_keys, fit_intercept=True) # n_jobs=8, min_samples_split=20)\n",
" model = LassoRegression(alpha=0.01, features=linear_model_features_keys, fit_intercept=True) # n_jobs=8, min_samples_split=20)\n",
" model.fit(new_x, y)\n",
" models_series.loc[ref_date] = model\n",
...
...
notebooks/model_comparing.ipynb
View file @
80748343
...
...
@@ -2,14 +2,15 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import numpy as np\n",
"import xgboost as xgb\n",
"from sklearn.metrics import r2_score\n",
"from sklearn.model_selection import train_test_split\n",
"from alphamind.api import *\n",
"from PyFin.api import *\n",
"\n",
...
...
@@ -18,14 +19,14 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count":
5
2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"start_date = '201
5
-01-01'\n",
"end_date = '201
7-12-31
'\n",
"start_date = '201
2
-01-01'\n",
"end_date = '201
8-01-05
'\n",
"\n",
"features = ['roe_q',\n",
" 'ep_q',\n",
...
...
@@ -36,7 +37,7 @@
" 'EPIBS']\n",
"\n",
"freq = '5b'\n",
"batch =
16
\n",
"batch =
32
\n",
"universe = Universe('custom', ['zz500', 'hs300'])\n",
"benchmark = 905\n",
"neutralized_risk = ['SIZE'] + industry_styles\n",
...
...
@@ -45,23 +46,29 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count":
5
3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2018-01-08 10:14:20,323 - ALPHA_MIND - INFO - Starting data package fetching ...\n",
"2018-01-08 10:22:40,245 - ALPHA_MIND - INFO - Loading data is finished\n",
"2018-01-08 10:22:47,375 - ALPHA_MIND - INFO - Data processing is finished\n"
"2018-01-10 14:56:47,595 - ALPHA_MIND - INFO - Starting data package fetching ...\n",
"2018-01-10 14:56:54,781 - ALPHA_MIND - INFO - factor data loading finished\n",
"2018-01-10 14:57:03,949 - ALPHA_MIND - INFO - return data loading finished\n",
"2018-01-10 14:57:05,113 - ALPHA_MIND - INFO - industry data loading finished\n",
"2018-01-10 14:57:05,828 - ALPHA_MIND - INFO - benchmark data loading finished\n",
"2018-01-10 14:57:15,662 - ALPHA_MIND - INFO - risk data loading finished\n",
"2018-01-10 14:57:17,773 - ALPHA_MIND - INFO - data merging finished\n",
"2018-01-10 14:57:19,490 - ALPHA_MIND - INFO - Loading data is finished\n",
"2018-01-10 14:57:35,324 - ALPHA_MIND - INFO - Data processing is finished\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time:
8min 27
s\n"
"Wall time:
47.7
s\n"
]
}
],
...
...
@@ -83,7 +90,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count":
5
4,
"metadata": {
"collapsed": true
},
...
...
@@ -112,24 +119,34 @@
},
{
"cell_type": "code",
"execution_count":
5
,
"execution_count":
66
,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2015-01-05 00:00:00\n",
"2015-04-28 00:00:00\n",
"2015-08-13 00:00:00\n",
"2015-12-07 00:00:00\n",
"2016-03-29 00:00:00\n",
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 447 ms\n"
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 1.26 s\n"
]
}
],
...
...
@@ -156,15 +173,15 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 6
7
,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.01
1958402073
2\n",
"-0.
587410279791
\n"
"0.01
0760900705
2\n",
"-0.
480548329833
\n"
]
}
],
...
...
@@ -178,29 +195,39 @@
"metadata": {},
"source": [
"## Lasso Regression\n",
"---------
---
"
"---------"
]
},
{
"cell_type": "code",
"execution_count":
7
,
"execution_count":
60
,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2015-01-05 00:00:00\n",
"2015-04-28 00:00:00\n",
"2015-08-13 00:00:00\n",
"2015-12-07 00:00:00\n",
"2016-03-29 00:00:00\n",
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 518 ms\n"
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 1.58 s\n"
]
}
],
...
...
@@ -227,15 +254,15 @@
},
{
"cell_type": "code",
"execution_count":
8
,
"execution_count":
61
,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.00
981862410414
\n",
"-0.
570666932124
\n"
"0.00
875291615929
\n",
"-0.
475440026
\n"
]
}
],
...
...
@@ -272,24 +299,25 @@
},
{
"cell_type": "code",
"execution_count":
10
,
"execution_count":
34
,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2015-01-05 00:00:00\n",
"2015-04-28 00:00:00\n",
"2015-08-13 00:00:00\n",
"2015-12-07 00:00:00\n",
"2016-03-29 00:00:00\n",
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 15.6 s\n"
"2017-01-03 00:00:00\n"
]
},
{
"ename": "NameError",
"evalue": "name 'cross_product' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<timed exec>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n",
"\u001b[1;31mNameError\u001b[0m: name 'cross_product' is not defined"
]
}
],
...
...
@@ -299,7 +327,7 @@
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
"for i, date in enumerate(train_dates
[:1]
):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
...
...
@@ -327,8 +355,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"0.0
322126965952
\n",
"-0.
6747428139
3\n"
"0.0
291928676769
\n",
"-0.
2414625437
3\n"
]
}
],
...
...
@@ -354,17 +382,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
"2015-01-05 00:00:00\n",
"2015-04-28 00:00:00\n",
"2015-08-13 00:00:00\n",
"2015-12-07 00:00:00\n",
"2016-03-29 00:00:00\n",
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 6.84 s\n"
"2017-01-03 00:00:00\n",
"2017-04-27 00:00:00\n",
"2017-08-15 00:00:00\n",
"2017-12-05 00:00:00\n",
"Wall time: 4.78 s\n"
]
}
],
...
...
@@ -402,8 +424,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"0.000
197987187525
\n",
"-0.
569332930413
\n"
"0.000
355789142204
\n",
"-0.
200552889618
\n"
]
}
],
...
...
@@ -429,17 +451,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
"2015-01-05 00:00:00\n",
"2015-04-28 00:00:00\n",
"2015-08-13 00:00:00\n",
"2015-12-07 00:00:00\n",
"2016-03-29 00:00:00\n",
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 2min 25s\n"
"2017-01-03 00:00:00\n",
"2017-04-27 00:00:00\n",
"2017-08-15 00:00:00\n",
"2017-12-05 00:00:00\n",
"Wall time: 1min 18s\n"
]
}
],
...
...
@@ -473,8 +489,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"0.01
49542839772
\n",
"-0.
566305808069
\n"
"0.01
37863030105
\n",
"-0.
197952235791
\n"
]
}
],
...
...
@@ -493,24 +509,18 @@
},
{
"cell_type": "code",
"execution_count":
22
,
"execution_count":
16
,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2015-01-05 00:00:00\n",
"2015-04-28 00:00:00\n",
"2015-08-13 00:00:00\n",
"2015-12-07 00:00:00\n",
"2016-03-29 00:00:00\n",
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 1min 53s\n"
"2017-01-03 00:00:00\n",
"2017-04-27 00:00:00\n",
"2017-08-15 00:00:00\n",
"2017-12-05 00:00:00\n",
"Wall time: 1min 32s\n"
]
}
],
...
...
@@ -541,15 +551,115 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0575499865219\n",
"-0.209037365429\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Native XGBoost Regressor\n",
"---------------"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 6min 57s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" \n",
" x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.33, random_state=42)\n",
" \n",
" dtrain = xgb.DMatrix(x_train, y_train)\n",
" deval = xgb.DMatrix(x_eval, y_eval)\n",
" param = {'silent': 1,\n",
" 'objective': 'reg:linear',\n",
" 'max_depth': 3,\n",
" 'eta': 0.005,\n",
" 'boost': 'gbtree',\n",
" 'tree_method': 'hist',\n",
" 'subsample': 0.1,\n",
" 'colsample_bytree': 0.25}\n",
" num_round = 2000\n",
" model = xgb.train(param, dtrain, num_round, evals=[(deval, 'eval')], early_stopping_rounds=50, verbose_eval=False)\n",
" \n",
" y_train_predict = model.predict(dtrain)\n",
" train_scores.append(r2_score(y_train, y_train_predict, multioutput='uniform_average'))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" dtest = xgb.DMatrix(p_x, p_y)\n",
" \n",
" y_test_predict = model.predict(dtest)\n",
" predict_scores.append(r2_score(p_y, y_test_predict, multioutput='uniform_average'))"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.
1654455223
\n",
"-0.
70322789707
\n"
"0.
0158347715471
\n",
"-0.
477095380466
\n"
]
}
],
...
...
notebooks/model_comparing_classifiers.ipynb
View file @
80748343
...
...
@@ -9,23 +9,25 @@
"outputs": [],
"source": [
"%matplotlib inline\n",
"import xgboost as xgb\n",
"import numpy as np\n",
"from alphamind.api import *\n",
"from PyFin.api import *\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"engine = SqlEngine()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 2
3
,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"start_date = '2012-01-01'\n",
"end_date = '201
7-12-31
'\n",
"end_date = '201
8-01-05
'\n",
"\n",
"features = ['roe_q',\n",
" 'ep_q',\n",
...
...
@@ -35,7 +37,7 @@
" 'EARNYILD',\n",
" 'EPIBS']\n",
"\n",
"freq = '
5
b'\n",
"freq = '
10
b'\n",
"batch = 16\n",
"universe = Universe('custom', ['zz500', 'hs300'])\n",
"benchmark = 905\n",
...
...
@@ -45,29 +47,29 @@
},
{
"cell_type": "code",
"execution_count":
3
,
"execution_count":
24
,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2018-01-
08 16:54:05,618
- ALPHA_MIND - INFO - Starting data package fetching ...\n",
"2018-01-
08 16:54:15,904
- ALPHA_MIND - INFO - factor data loading finished\n",
"2018-01-
08 16:54:26,575
- ALPHA_MIND - INFO - return data loading finished\n",
"2018-01-
08 16:54:27,94
4 - ALPHA_MIND - INFO - industry data loading finished\n",
"2018-01-
08 16:54:28,634
- ALPHA_MIND - INFO - benchmark data loading finished\n",
"2018-01-
08 16:54:41,966
- ALPHA_MIND - INFO - risk data loading finished\n",
"2018-01-
08 16:54:45,557
- ALPHA_MIND - INFO - data merging finished\n",
"2018-01-
08 16:54:48,150
- ALPHA_MIND - INFO - Loading data is finished\n",
"2018-01-
08 16:54:59,541
- ALPHA_MIND - INFO - Data processing is finished\n"
"2018-01-
11 15:12:44,105
- ALPHA_MIND - INFO - Starting data package fetching ...\n",
"2018-01-
11 15:12:53,578
- ALPHA_MIND - INFO - factor data loading finished\n",
"2018-01-
11 15:13:03,880
- ALPHA_MIND - INFO - return data loading finished\n",
"2018-01-
11 15:13:05,38
4 - ALPHA_MIND - INFO - industry data loading finished\n",
"2018-01-
11 15:13:06,178
- ALPHA_MIND - INFO - benchmark data loading finished\n",
"2018-01-
11 15:13:17,845
- ALPHA_MIND - INFO - risk data loading finished\n",
"2018-01-
11 15:13:21,266
- ALPHA_MIND - INFO - data merging finished\n",
"2018-01-
11 15:13:23,371
- ALPHA_MIND - INFO - Loading data is finished\n",
"2018-01-
11 15:13:33,174
- ALPHA_MIND - INFO - Data processing is finished\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time:
53.9
s\n"
"Wall time:
49.1
s\n"
]
}
],
...
...
@@ -89,7 +91,7 @@
},
{
"cell_type": "code",
"execution_count":
4
,
"execution_count":
25
,
"metadata": {
"collapsed": true
},
...
...
@@ -110,8 +112,10 @@
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"for key, val in train_y.items():\n",
...
...
@@ -131,7 +135,7 @@
},
{
"cell_type": "code",
"execution_count":
6
,
"execution_count":
27
,
"metadata": {},
"outputs": [
{
...
...
@@ -158,7 +162,7 @@
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time:
6.92
s\n"
"Wall time:
5.34
s\n"
]
}
],
...
...
@@ -185,15 +189,15 @@
},
{
"cell_type": "code",
"execution_count":
7
,
"execution_count":
28
,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5410
6394519
\n",
"0.519
270440032
\n"
"0.5410
13986745
\n",
"0.519
32344036
\n"
]
}
],
...
...
@@ -212,7 +216,7 @@
},
{
"cell_type": "code",
"execution_count": 2
1
,
"execution_count": 2
9
,
"metadata": {},
"outputs": [
{
...
...
@@ -239,7 +243,7 @@
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 1
7min 2
s\n"
"Wall time: 1
5min 34
s\n"
]
}
],
...
...
@@ -266,15 +270,15 @@
},
{
"cell_type": "code",
"execution_count":
22
,
"execution_count":
30
,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.557
667621301
\n",
"0.55
4107283453
\n"
"0.557
563825608
\n",
"0.55
3974775005
\n"
]
}
],
...
...
@@ -293,7 +297,7 @@
},
{
"cell_type": "code",
"execution_count":
10
,
"execution_count":
31
,
"metadata": {},
"outputs": [
{
...
...
@@ -320,7 +324,7 @@
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time:
4min 33
s\n"
"Wall time:
13min 40
s\n"
]
}
],
...
...
@@ -336,7 +340,7 @@
" x = train_x[date]\n",
" y = train_y[date]\n",
" \n",
" model = XGBClassifier(n_estimators=
5
00,\n",
" model = XGBClassifier(n_estimators=
10
00,\n",
" learning_rate=0.02,\n",
" max_depth=3,\n",
" n_jobs=-1,\n",
...
...
@@ -352,15 +356,118 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.642946015759\n",
"0.537550683184\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Native XGBoost Classifier\n",
"---------------"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 1min 6s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" \n",
" x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.33, random_state=42)\n",
" \n",
" dtrain = xgb.DMatrix(x_train, y_train)\n",
" deval = xgb.DMatrix(x_eval, y_eval)\n",
" param = {'silent': 1,\n",
" 'objective': 'binary:logistic',\n",
" 'max_depth': 3,\n",
" 'eta': 0.01,\n",
" 'boost': 'dart',\n",
" 'tree_method': 'hist',\n",
" 'subsample': 0.25,\n",
" 'colsample_bytree': 0.5}\n",
" num_round = 2000\n",
" model = xgb.train(param, dtrain, num_round, evals=[(deval, 'eval')], early_stopping_rounds=50, verbose_eval=False)\n",
" \n",
" y_train_predict = model.predict(dtrain)\n",
" label = dtrain.get_label()\n",
" train_score = np.sum((y_train_predict > 0.5) == label) / float(len(label))\n",
"\n",
" train_scores.append(train_score)\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" dtest = xgb.DMatrix(p_x, p_y)\n",
" \n",
" y_test_predict = model.predict(dtest)\n",
" p_label = dtest.get_label()\n",
" test_score = np.sum((y_test_predict > 0.5) == p_label) / float(len(p_label))\n",
" predict_scores.append(test_score)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.
612408578757
\n",
"0.5
43523900352
\n"
"0.
567225761699
\n",
"0.5
50997907465
\n"
]
}
],
...
...
@@ -379,7 +486,7 @@
},
{
"cell_type": "code",
"execution_count":
12
,
"execution_count":
35
,
"metadata": {
"collapsed": true
},
...
...
@@ -397,7 +504,7 @@
},
{
"cell_type": "code",
"execution_count":
13
,
"execution_count":
36
,
"metadata": {},
"outputs": [
{
...
...
@@ -424,7 +531,7 @@
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time:
26.7
s\n"
"Wall time:
36.1
s\n"
]
}
],
...
...
@@ -455,15 +562,15 @@
},
{
"cell_type": "code",
"execution_count":
14
,
"execution_count":
37
,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5681
51341668
\n",
"0.517
317353974
\n"
"0.5681
25478425
\n",
"0.517
523115163
\n"
]
}
],
...
...
@@ -482,7 +589,7 @@
},
{
"cell_type": "code",
"execution_count":
19
,
"execution_count":
38
,
"metadata": {},
"outputs": [
{
...
...
@@ -509,7 +616,7 @@
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 1
2min 2
s\n"
"Wall time: 1
4min 40
s\n"
]
}
],
...
...
@@ -540,15 +647,15 @@
},
{
"cell_type": "code",
"execution_count":
20
,
"execution_count":
39
,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5490
10335268
\n",
"0.5
6003282178
\n"
"0.5490
90142483
\n",
"0.5
59944504146
\n"
]
}
],
...
...
@@ -567,7 +674,7 @@
},
{
"cell_type": "code",
"execution_count":
17
,
"execution_count":
40
,
"metadata": {},
"outputs": [
{
...
...
@@ -594,7 +701,7 @@
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 1
7min 2
s\n"
"Wall time: 1
2min 25
s\n"
]
}
],
...
...
@@ -630,15 +737,122 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.59375573895\n",
"0.55230987889\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Native XGBoost Classifier with More Features\n",
"---------------"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 5min 23s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" risk = train_risk[date][:, 1:]\n",
" new_x = cross_product(x, risk)\n",
" \n",
" x_train, x_eval, y_train, y_eval = train_test_split(new_x, y, test_size=0.33, random_state=42)\n",
" \n",
" dtrain = xgb.DMatrix(x_train, y_train)\n",
" deval = xgb.DMatrix(x_eval, y_eval)\n",
" param = {'silent': 1,\n",
" 'objective': 'binary:logistic',\n",
" 'max_depth': 3,\n",
" 'eta': 0.01,\n",
" 'booster': 'dart',\n",
" 'tree_method': 'hist',\n",
" 'subsample': 0.25,\n",
" 'colsample_bytree': 0.5}\n",
" num_round = 2000\n",
" model = xgb.train(param, dtrain, num_round, evals=[(deval, 'eval')], early_stopping_rounds=50, verbose_eval=False)\n",
" \n",
" y_train_predict = model.predict(dtrain)\n",
" label = dtrain.get_label()\n",
" train_score = np.sum((y_train_predict > 0.5) == label) / float(len(label))\n",
"\n",
" train_scores.append(train_score)\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" p_risk = predict_risk[date][:, 1:]\n",
" new_p_x = cross_product(p_x, p_risk)\n",
" dtest = xgb.DMatrix(new_p_x, p_y)\n",
" \n",
" y_test_predict = model.predict(dtest)\n",
" p_label = dtest.get_label()\n",
" test_score = np.sum((y_test_predict > 0.5) == p_label) / float(len(p_label))\n",
" predict_scores.append(test_score)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5
9373913673
9\n",
"0.552
533996977
\n"
"0.5
6005771254
9\n",
"0.552
663472836
\n"
]
}
],
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment