Commit 8b47319b authored by Dr.李

update notebooks

parent 31d1dbb1
This source diff could not be displayed because it is too large.
@@ -2,8 +2,10 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
@@ -11,16 +13,18 @@
"from alphamind.api import *\n",
"from PyFin.api import *\n",
"\n",
"engine = SqlEngine('postgres+psycopg2://postgres:we083826@localhost/alpha')"
"engine = SqlEngine()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"start_date = '2012-01-01'\n",
"start_date = '2015-01-01'\n",
"end_date = '2017-12-31'\n",
"\n",
"features = ['roe_q',\n",
@@ -41,9 +45,26 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2018-01-08 10:14:20,323 - ALPHA_MIND - INFO - Starting data package fetching ...\n",
"2018-01-08 10:22:40,245 - ALPHA_MIND - INFO - Loading data is finished\n",
"2018-01-08 10:22:47,375 - ALPHA_MIND - INFO - Data processing is finished\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 8min 27s\n"
]
}
],
"source": [
"%%time\n",
"factor_data = fetch_data_package(engine,\n",
@@ -62,8 +83,10 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"features = factor_data['x_names']\n",
@@ -89,9 +112,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2015-01-05 00:00:00\n",
"2015-04-28 00:00:00\n",
"2015-08-13 00:00:00\n",
"2015-12-07 00:00:00\n",
"2016-03-29 00:00:00\n",
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 447 ms\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
@@ -115,9 +156,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0119584020732\n",
"-0.587410279791\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
@@ -133,9 +183,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2015-01-05 00:00:00\n",
"2015-04-28 00:00:00\n",
"2015-08-13 00:00:00\n",
"2015-12-07 00:00:00\n",
"2016-03-29 00:00:00\n",
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 518 ms\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
@@ -159,9 +227,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.00981862410414\n",
"-0.570666932124\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
@@ -177,8 +254,10 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def cross_product(x, y):\n",
@@ -193,9 +272,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2015-01-05 00:00:00\n",
"2015-04-28 00:00:00\n",
"2015-08-13 00:00:00\n",
"2015-12-07 00:00:00\n",
"2016-03-29 00:00:00\n",
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 15.6 s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
@@ -223,9 +320,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0322126965952\n",
"-0.67474281393\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
@@ -241,9 +347,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2015-01-05 00:00:00\n",
"2015-04-28 00:00:00\n",
"2015-08-13 00:00:00\n",
"2015-12-07 00:00:00\n",
"2016-03-29 00:00:00\n",
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 6.84 s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
@@ -271,9 +395,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.000197987187525\n",
"-0.569332930413\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
@@ -289,9 +422,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2015-01-05 00:00:00\n",
"2015-04-28 00:00:00\n",
"2015-08-13 00:00:00\n",
"2015-12-07 00:00:00\n",
"2016-03-29 00:00:00\n",
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 2min 25s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
@@ -315,9 +466,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0149542839772\n",
"-0.566305808069\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
@@ -333,9 +493,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 22,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2015-01-05 00:00:00\n",
"2015-04-28 00:00:00\n",
"2015-08-13 00:00:00\n",
"2015-12-07 00:00:00\n",
"2016-03-29 00:00:00\n",
"2016-07-18 00:00:00\n",
"2016-11-09 00:00:00\n",
"2017-03-02 00:00:00\n",
"2017-06-22 00:00:00\n",
"2017-10-12 00:00:00\n",
"Wall time: 1min 53s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
@@ -347,7 +525,12 @@
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" model = XGBRegressor(n_estimators=500, max_features='sqrt', max_depth=3, n_jobs=-1)\n",
" model = XGBRegressor(n_estimators=500,\n",
" learning_rate=0.02,\n",
" max_depth=3,\n",
" n_jobs=-1,\n",
" subsample=0.25,\n",
" colsample_bytree=0.5)\n",
" model.fit(x, y)\n",
" train_scores.append(model.score(x, y))\n",
" \n",
@@ -358,33 +541,29 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 23,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.1654455223\n",
"-0.70322789707\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" model = XGBRegressor(n_estimators=500, max_features='sqrt', max_depth=3, n_jobs=-1)\n",
" model.fit(x, y)\n",
" new_train_scores.append(model.score(x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" new_predict_scores.append(model.score(p_x, p_y))"
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
@@ -405,7 +584,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
"version": "3.6.3"
}
},
"nbformat": 4,
......
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import numpy as np\n",
"from alphamind.api import *\n",
"from PyFin.api import *\n",
"\n",
"engine = SqlEngine()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"start_date = '2012-01-01'\n",
"end_date = '2017-12-31'\n",
"\n",
"features = ['roe_q',\n",
" 'ep_q',\n",
" 'DivP',\n",
" 'cfinc1_q',\n",
" 'EBIT',\n",
" 'EARNYILD',\n",
" 'EPIBS']\n",
"\n",
"freq = '5b'\n",
"batch = 16\n",
"universe = Universe('custom', ['zz500', 'hs300'])\n",
"benchmark = 905\n",
"neutralized_risk = ['SIZE'] + industry_styles\n",
"horizon = map_freq(freq)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2018-01-08 16:54:05,618 - ALPHA_MIND - INFO - Starting data package fetching ...\n",
"2018-01-08 16:54:15,904 - ALPHA_MIND - INFO - factor data loading finished\n",
"2018-01-08 16:54:26,575 - ALPHA_MIND - INFO - return data loading finished\n",
"2018-01-08 16:54:27,944 - ALPHA_MIND - INFO - industry data loading finished\n",
"2018-01-08 16:54:28,634 - ALPHA_MIND - INFO - benchmark data loading finished\n",
"2018-01-08 16:54:41,966 - ALPHA_MIND - INFO - risk data loading finished\n",
"2018-01-08 16:54:45,557 - ALPHA_MIND - INFO - data merging finished\n",
"2018-01-08 16:54:48,150 - ALPHA_MIND - INFO - Loading data is finished\n",
"2018-01-08 16:54:59,541 - ALPHA_MIND - INFO - Data processing is finished\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 53.9 s\n"
]
}
],
"source": [
"%%time\n",
"factor_data = fetch_data_package(engine,\n",
" features,\n",
" start_date,\n",
" end_date,\n",
" '5b',\n",
" universe,\n",
" benchmark,\n",
" batch=batch,\n",
" warm_start=batch,\n",
" neutralized_risk=neutralized_risk, \n",
" pre_process=[winsorize_normal, standardize],\n",
" post_process=[winsorize_normal, standardize])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"features = factor_data['x_names']\n",
"\n",
"train_x = factor_data['train']['x']\n",
"train_y = factor_data['train']['y']\n",
"train_risk = factor_data['train']['risk']\n",
"ref_dates = sorted(train_x.keys())\n",
"\n",
"predict_x = factor_data['predict']['x']\n",
"predict_y = factor_data['predict']['y']\n",
"predict_risk = factor_data['predict']['risk']\n",
"settlement = factor_data['settlement']"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"for key, val in train_y.items():\n",
" train_y[key] = np.where(val > 0., 1, 0)\n",
" \n",
"for key, val in predict_y.items():\n",
" predict_y[key] = np.where(val > 0., 1, 0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Logistic Regression\n",
"--------------"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 6.92 s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" \n",
" model = LogisticRegression(fit_intercept=False, features=features)\n",
" model.fit(x, y)\n",
" train_scores.append(model.score(x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" predict_scores.append(model.score(p_x, p_y))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.54106394519\n",
"0.519270440032\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Random Forest Classifier\n",
"-----------"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 17min 2s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" \n",
" model = RandomForestClassifier(n_estimators=1000, max_features='sqrt', max_depth=3, n_jobs=-1)\n",
" model.fit(x, y)\n",
" train_scores.append(model.score(x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" predict_scores.append(model.score(p_x, p_y))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.557667621301\n",
"0.554107283453\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## XGBoost Classifier\n",
"---------"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 4min 33s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" \n",
" model = XGBClassifier(n_estimators=500,\n",
" learning_rate=0.02,\n",
" max_depth=3,\n",
" n_jobs=-1,\n",
" subsample=0.25,\n",
" colsample_bytree=0.5)\n",
" model.fit(x, y)\n",
" train_scores.append(model.score(x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" predict_scores.append(model.score(p_x, p_y))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.612408578757\n",
"0.543523900352\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Logistic Regression with More Features\n",
"-----------------"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def cross_product(x, y):\n",
" n, m = x.shape\n",
" res = []\n",
" \n",
" for j in range(m):\n",
" res.append(x[:, [j]] * y)\n",
" \n",
" return np.concatenate(res, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 26.7 s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" risk = train_risk[date][:, 1:]\n",
" new_x = cross_product(x, risk)\n",
" \n",
" model = LogisticRegression(fit_intercept=False, features=features)\n",
" model.fit(new_x, y)\n",
" train_scores.append(model.score(new_x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" p_risk = predict_risk[date][:, 1:]\n",
" new_p_x = cross_product(p_x, p_risk)\n",
" predict_scores.append(model.score(new_p_x, p_y))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.568151341668\n",
"0.517317353974\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Random Forest Classifier with More Features\n",
"-----------"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 12min 2s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" risk = train_risk[date][:, 1:]\n",
" new_x = cross_product(x, risk)\n",
" \n",
" model = RandomForestClassifier(n_estimators=1000, max_features='sqrt', max_depth=3, n_jobs=-1)\n",
" model.fit(new_x, y)\n",
" train_scores.append(model.score(new_x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" p_risk = predict_risk[date][:, 1:]\n",
" new_p_x = cross_product(p_x, p_risk)\n",
" predict_scores.append(model.score(new_p_x, p_y))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.549010335268\n",
"0.56003282178\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## XGBoost Classifier with More Features\n",
"---------"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 17min 2s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" risk = train_risk[date][:, 1:]\n",
" new_x = cross_product(x, risk)\n",
" \n",
" model = XGBClassifier(n_estimators=500,\n",
" learning_rate=0.02,\n",
" max_depth=3,\n",
" n_jobs=-1,\n",
" subsample=0.25,\n",
" colsample_bytree=0.1)\n",
" model.fit(new_x, y)\n",
" train_scores.append(model.score(new_x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" p_risk = predict_risk[date][:, 1:]\n",
" new_p_x = cross_product(p_x, p_risk)\n",
" predict_scores.append(model.score(new_p_x, p_y))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.593739136739\n",
"0.552533996977\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
This source diff could not be displayed because it is too large.