Commit d99cf0c5 authored by Yucheng

Merge branch 'master' of https://github.com/lion-sing/alpha-mind

parents bfc257f9 7924db6d
......@@ -33,6 +33,7 @@ from alphamind.model import RandomForestRegressor
from alphamind.model import RandomForestClassifier
from alphamind.model import XGBRegressor
from alphamind.model import XGBClassifier
from alphamind.model import XGBTrainer
from alphamind.model import load_model
from alphamind.model.data_preparing import fetch_data_package
from alphamind.model.data_preparing import fetch_train_phase
......@@ -74,6 +75,7 @@ __all__ = [
'RandomForestClassifier',
'XGBRegressor',
'XGBClassifier',
'XGBTrainer',
'load_model',
'NaiveExecutor',
'ThresholdExecutor',
......
......@@ -1954,5 +1954,5 @@ class OutrightTmp(Base):
if __name__ == '__main__':
from sqlalchemy import create_engine
engine = create_engine('postgres+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
engine = create_engine('postgres+psycopg2://postgres:we083826@101.132.104.118/alpha')
Base.metadata.create_all(engine)
......@@ -388,11 +388,13 @@ class SqlEngine(object):
df = pd.read_sql(query, self.engine)
if universe.is_filtered:
codes = universe.query(self, start_date, end_date, dates)
df = pd.merge(df, codes, how='inner', on=['trade_date', 'code']).sort_values(['trade_date', 'code'])
df = pd.merge(df, codes, how='inner', on=['trade_date', 'code'])
if external_data is not None:
df = pd.merge(df, external_data, on=['trade_date', 'code']).dropna()
df.sort_values(['trade_date', 'code'], inplace=True)
df.set_index('trade_date', inplace=True)
res = transformer.transform('code', df)
......
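The sort was moved after the external-data merge because pd.merge does not guarantee row order, so the frame should be sorted once, after the last merge. A minimal sketch of the pattern, using hypothetical toy frames:

import pandas as pd

# toy stand-ins for the factor frame and the external data (hypothetical values)
df = pd.DataFrame({'trade_date': ['2018-01-02', '2018-01-01'],
                   'code': [2, 1],
                   'factor': [0.5, 0.3]})
external_data = pd.DataFrame({'trade_date': ['2018-01-01', '2018-01-02'],
                              'code': [1, 2],
                              'ext': [1.0, 2.0]})

df = pd.merge(df, external_data, on=['trade_date', 'code']).dropna()
# sort once, after the final merge, then index by date as the query code does
df.sort_values(['trade_date', 'code'], inplace=True)
df.set_index('trade_date', inplace=True)
print(df)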
......@@ -17,25 +17,24 @@ import datetime as dt
start = dt.datetime.now()
universe_name = 'zz500'
universe = Universe('custom', ['zz800'])
factor_name = 'PE'
expression = 1. / LAST(factor_name)
simple_expression = CSRes(LAST('OperCashInToAsset'), 'roe_q')
alpha_factor_name = '1/PE'
alpha_factor = {alpha_factor_name: expression}
alpha_factor_name = 'alpha_factor'
alpha_factor = {alpha_factor_name: simple_expression}
# end of formula definition
engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
universe = Universe('custom', [universe_name])
neutralize_risk = ['SIZE'] + industry_styles
freq = '5b'
neutralize_risk = ['SIZE', 'LEVERAGE'] + industry_styles
freq = '10b'
n_bins = 5
horizon = map_freq(freq)
start_date = '2012-01-01'
end_date = '2017-11-21'
end_date = '2018-01-05'
dates = makeSchedule(start_date,
end_date,
......@@ -93,10 +92,9 @@ df = df.cumsum().plot(ax=axes[0], title='Quantile Analysis for {0}'.format(alpha
# =================================================================== #
factor_name = 'PE'
expression = DIFF(1./LAST(factor_name))
alpha_factor_name = '1/PE_1w_diff'
alpha_factor = {alpha_factor_name: expression}
alpha_factor_name = alpha_factor_name + '_1w_diff'
alpha_factor = {alpha_factor_name: DIFF(simple_expression)}
dates = makeSchedule(start_date,
end_date,
......
......@@ -14,6 +14,7 @@ from alphamind.model.treemodel import RandomForestRegressor
from alphamind.model.treemodel import RandomForestClassifier
from alphamind.model.treemodel import XGBRegressor
from alphamind.model.treemodel import XGBClassifier
from alphamind.model.treemodel import XGBTrainer
from alphamind.model.loader import load_model
......@@ -26,4 +27,5 @@ __all__ = ['LinearRegression',
'RandomForestClassifier',
'XGBRegressor',
'XGBClassifier',
'XGBTrainer',
'load_model']
\ No newline at end of file
......@@ -27,6 +27,7 @@ from alphamind.utilities import map_freq
def _merge_df(engine, names, factor_df, return_df, universe, dates, risk_model, neutralized_risk):
risk_df = engine.fetch_risk_model_range(universe, dates=dates, risk_model=risk_model)[1]
alpha_logger.info("risk data loading finished")
used_neutralized_risk = list(set(total_risk_factors).difference(names))
risk_df = risk_df[['trade_date', 'code'] + used_neutralized_risk].dropna()
return_df = pd.merge(return_df, risk_df, on=['trade_date', 'code'])
......@@ -45,9 +46,10 @@ def _merge_df(engine, names, factor_df, return_df, universe, dates, risk_model,
x_values = train_x[names].values.astype(float)
y_values = train_y[['dx']].values
codes = train_x['code'].values
date_label = pd.DatetimeIndex(factor_df.trade_date).to_pydatetime()
dates = np.unique(date_label)
return return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y
return return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y, codes
def prepare_data(engine: SqlEngine,
......@@ -82,9 +84,13 @@ def prepare_data(engine: SqlEngine,
factor_df = engine.fetch_factor_range(universe,
factors=transformer,
dates=dates).sort_values(['trade_date', 'code'])
alpha_logger.info("factor data loading finished")
return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
alpha_logger.info("return data loading finished")
industry_df = engine.fetch_industry_range(universe, dates=dates)
alpha_logger.info("industry data loading finished")
benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
alpha_logger.info("benchmark data loading finished")
df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()
df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
......@@ -102,13 +108,15 @@ def batch_processing(x_values,
batch,
risk_exp,
pre_process,
post_process):
post_process,
codes):
train_x_buckets = {}
train_y_buckets = {}
train_risk_buckets = {}
predict_x_buckets = {}
predict_y_buckets = {}
predict_risk_buckets = {}
predict_codes_bucket = {}
for i, start in enumerate(groups[:-batch]):
end = groups[i + batch]
......@@ -141,6 +149,7 @@ def batch_processing(x_values,
sub_dates = group_label[left_index:right_index]
this_raw_x = x_values[left_index:right_index]
this_codes = codes[left_index:right_index]
if risk_exp is not None:
this_risk_exp = risk_exp[left_index:right_index]
......@@ -156,6 +165,7 @@ def batch_processing(x_values,
inner_right_index = bisect.bisect_right(sub_dates, end)
predict_x_buckets[end] = ne_x[inner_left_index:inner_right_index]
predict_risk_buckets[end] = this_risk_exp[inner_left_index:inner_right_index]
predict_codes_bucket[end] = this_codes[inner_left_index:inner_right_index]
this_raw_y = y_values[left_index:right_index]
if len(this_raw_y) > 0:
......@@ -165,7 +175,13 @@ def batch_processing(x_values,
post_process=post_process)
predict_y_buckets[end] = ne_y[inner_left_index:inner_right_index]
return train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets
return train_x_buckets, \
train_y_buckets, \
train_risk_buckets, \
predict_x_buckets, \
predict_y_buckets, \
predict_risk_buckets, \
predict_codes_bucket
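A minimal sketch of the bisect windowing that fills these buckets, with made-up dates: the outer loop slides a window of batch periods ending at end, and the inner bisect isolates the rows dated exactly end for the predict buckets.

import bisect
import numpy as np

# per-row date labels must be sorted for bisect to be valid (made-up dates)
group_label = np.array(['2018-01-01', '2018-01-01', '2018-01-02',
                        '2018-01-03', '2018-01-03'], dtype='datetime64[D]')
groups = np.unique(group_label)
batch = 2

for i, start in enumerate(groups[:-batch]):
    end = groups[i + batch]
    left_index = bisect.bisect_left(group_label, start)
    right_index = bisect.bisect_right(group_label, end)
    sub_dates = group_label[left_index:right_index]
    # rows labelled exactly `end` feed the predict bucket keyed by `end`
    inner_left = bisect.bisect_left(sub_dates, end)
    inner_right = bisect.bisect_right(sub_dates, end)
    print(end, 'window rows:', right_index - left_index,
          'predict rows:', inner_right - inner_left)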
def fetch_data_package(engine: SqlEngine,
......@@ -193,9 +209,11 @@ def fetch_data_package(engine: SqlEngine,
benchmark,
warm_start)
return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y = \
return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y, codes = \
_merge_df(engine, transformer.names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)
alpha_logger.info("data merging finished")
return_df['weight'] = train_x['weight']
return_df['industry'] = train_x['industry']
return_df['industry_code'] = train_x['industry_code']
......@@ -207,15 +225,16 @@ def fetch_data_package(engine: SqlEngine,
alpha_logger.info("Loading data is finished")
train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets = batch_processing(
x_values,
train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets, predict_codes_bucket \
= batch_processing(x_values,
y_values,
dates,
date_label,
batch,
risk_exp,
pre_process,
post_process)
post_process,
codes)
alpha_logger.info("Data processing is finished")
......@@ -223,7 +242,7 @@ def fetch_data_package(engine: SqlEngine,
ret['x_names'] = transformer.names
ret['settlement'] = return_df
ret['train'] = {'x': train_x_buckets, 'y': train_y_buckets, 'risk': train_risk_buckets}
ret['predict'] = {'x': predict_x_buckets, 'y': predict_y_buckets, 'risk': predict_risk_buckets}
ret['predict'] = {'x': predict_x_buckets, 'y': predict_y_buckets, 'risk': predict_risk_buckets, 'code': predict_codes_bucket}
return ret
......@@ -261,7 +280,7 @@ def fetch_train_phase(engine,
return_df, factor_df = df[['trade_date', 'code', 'dx']], df[
['trade_date', 'code', 'isOpen'] + transformer.names]
return_df, dates, date_label, risk_exp, x_values, y_values, _, _ = \
return_df, dates, date_label, risk_exp, x_values, y_values, _, _, codes = \
_merge_df(engine, transformer.names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)
if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
......@@ -274,6 +293,7 @@ def fetch_train_phase(engine,
index = (date_label >= start) & (date_label <= end)
this_raw_x = x_values[index]
this_raw_y = y_values[index]
this_code = codes[index]
if risk_exp is not None:
this_risk_exp = risk_exp[index]
else:
......@@ -291,7 +311,7 @@ def fetch_train_phase(engine,
ret = dict()
ret['x_names'] = transformer.names
ret['train'] = {'x': ne_x, 'y': ne_y}
ret['train'] = {'x': ne_x, 'y': ne_y, 'code': this_code}
return ret
......@@ -342,7 +362,6 @@ def fetch_predict_phase(engine,
end = dates[-1]
start = dates[-batch]
# index = (date_label >= start) & (date_label <= end)
left_index = bisect.bisect_left(date_label, start)
right_index = bisect.bisect_right(date_label, end)
this_raw_x = x_values[left_index:right_index]
......@@ -380,26 +399,12 @@ def fetch_predict_phase(engine,
if __name__ == '__main__':
engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
universe = Universe('zz500', ['ashare_ex'])
universe = Universe('zz500', ['hs300', 'zz500'])
neutralized_risk = ['SIZE']
res = fetch_train_phase(engine,
['EPS', 'CFinc1'],
'2017-09-04',
'2w',
res = fetch_predict_phase(engine, ['ep_q'],
'2018-01-08',
'5b',
universe,
4,
warm_start=1,
16,
neutralized_risk=neutralized_risk)
print(res)
res = fetch_predict_phase(engine,
['EPS', 'CFinc1'],
'2017-09-04',
'2w',
universe,
4,
warm_start=1,
neutralized_risk=neutralized_risk)
print(res)
......@@ -14,6 +14,7 @@ from alphamind.model.treemodel import RandomForestRegressor
from alphamind.model.treemodel import RandomForestClassifier
from alphamind.model.treemodel import XGBRegressor
from alphamind.model.treemodel import XGBClassifier
from alphamind.model.treemodel import XGBTrainer
def load_model(model_desc: dict) -> ModelBase:
......@@ -37,5 +38,7 @@ def load_model(model_desc: dict) -> ModelBase:
return XGBRegressor.load(model_desc)
elif 'XGBClassifier' in model_name_parts:
return XGBClassifier.load(model_desc)
elif 'XGBTrainer' in model_name_parts:
return XGBTrainer.load(model_desc)
else:
raise ValueError('{0} is not currently supported in model loader.'.format(model_name))
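A hedged round-trip sketch for the new XGBTrainer branch, on synthetic data (names as exported in the __init__ diff above):

import numpy as np
from alphamind.model import XGBTrainer, load_model

x = np.random.randn(500, 10)
y = np.where(np.random.randn(500) > 0., 1, 0)

model = XGBTrainer(features=list(range(10)), objective='binary:logistic')
model.fit(x, y)

desc = model.save()           # a plain dict, so it can be persisted anywhere
new_model = load_model(desc)  # dispatches on the class name in the description
np.testing.assert_array_almost_equal(model.predict(x), new_model.predict(x))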
......@@ -21,7 +21,7 @@ class ModelBase(metaclass=abc.ABCMeta):
self.impl = None
self.trained_time = None
def fit(self, x, y):
def fit(self, x: np.ndarray, y: np.ndarray):
self.impl.fit(x, y.flatten())
self.trained_time = arrow.now().format("YYYY-MM-DD HH:mm:ss")
......
......@@ -7,9 +7,13 @@ Created on 2017-12-4
from typing import List
from distutils.version import LooseVersion
import arrow
import numpy as np
from sklearn import __version__ as sklearn_version
from sklearn.ensemble import RandomForestRegressor as RandomForestRegressorImpl
from sklearn.ensemble import RandomForestClassifier as RandomForestClassifierImpl
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import __version__ as xgboost_version
from xgboost import XGBRegressor as XGBRegressorImpl
from xgboost import XGBClassifier as XGBClassifierImpl
......@@ -155,4 +159,91 @@ class XGBClassifier(ModelBase):
return self.impl.feature_importances_.tolist()
class XGBTrainer(ModelBase):
def __init__(self,
objective='binary:logistic',
booster='gbtree',
tree_method='hist',
n_estimators: int=100,
learning_rate: float=0.1,
max_depth=3,
eval_sample=None,
early_stopping_rounds=None,
subsample=1.,
colsample_bytree=1.,
features: List = None,
random_state=0,
**kwargs):
super().__init__(features)
self.params = {
'silent': 1,
'objective': objective,
'max_depth': max_depth,
'eta': learning_rate,
'booster': booster,
'tree_method': tree_method,
'subsample': subsample,
'colsample_bytree': colsample_bytree,
'seed': random_state
}
self.eval_sample = eval_sample
self.num_boost_round = n_estimators
self.early_stopping_rounds = early_stopping_rounds
self.impl = None
self.kwargs = kwargs
def fit(self, x, y):
if self.eval_sample:
x_train, x_eval, y_train, y_eval = train_test_split(x,
y,
test_size=self.eval_sample,
random_state=42)
d_train = xgb.DMatrix(x_train, y_train)
d_eval = xgb.DMatrix(x_eval, y_eval)
self.impl = xgb.train(params=self.params,
dtrain=d_train,
num_boost_round=self.num_boost_round,
evals=[(d_eval, 'eval')],
early_stopping_rounds=self.early_stopping_rounds,
verbose_eval=False,
**self.kwargs)
else:
d_train = xgb.DMatrix(x, y)
self.impl = xgb.train(params=self.params,
dtrain=d_train,
num_boost_round=self.num_boost_round,
**self.kwargs)
self.trained_time = arrow.now().format("YYYY-MM-DD HH:mm:ss")
def predict(self, x: np.ndarray) -> np.ndarray:
d_predict = xgb.DMatrix(x)
return self.impl.predict(d_predict)
def save(self) -> dict:
model_desc = super().save()
model_desc['xgboost_version'] = xgboost_version
model_desc['importances'] = self.importances
return model_desc
@classmethod
def load(cls, model_desc: dict):
obj_layout = super().load(model_desc)
if LooseVersion(xgboost_version) < LooseVersion(model_desc['xgboost_version']):
alpha_logger.warning('Current xgboost version {0} is lower than the model version {1}. '
'Loaded model may work incorrectly.'.format(xgboost_version,
model_desc['xgboost_version']))
return obj_layout
@property
def importances(self):
imps = self.impl.get_fscore().items()
imps = sorted(imps, key=lambda x: int(x[0][1:]))  # numeric feature index; a plain string sort puts 'f10' before 'f2'
return list(zip(*imps))[1]
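A minimal usage sketch for the eval split, on synthetic data; early stopping only takes effect when eval_sample is set, since that is the branch that builds an eval set:

import numpy as np
from alphamind.model.treemodel import XGBTrainer

x = np.random.randn(1000, 10)
y = np.where(np.random.randn(1000) > 0., 1, 0)

model = XGBTrainer(features=list(range(10)),
                   objective='binary:logistic',
                   n_estimators=200,
                   eval_sample=0.25,          # hold out 25% of rows as the eval set
                   early_stopping_rounds=20)
model.fit(x, y)
proba = model.predict(x)                      # binary:logistic yields probabilities
labels = np.where(proba > 0.5, 1, 0)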
......@@ -12,16 +12,18 @@ from alphamind.model.treemodel import RandomForestRegressor
from alphamind.model.treemodel import RandomForestClassifier
from alphamind.model.treemodel import XGBRegressor
from alphamind.model.treemodel import XGBClassifier
from alphamind.model.treemodel import XGBTrainer
class TestTreeModel(unittest.TestCase):
def setUp(self):
self.x = np.random.randn(1000, 10)
self.y = np.random.randn(1000)
def test_random_forest_regress_persistence(self):
model = RandomForestRegressor(features=list(range(10)))
x = np.random.randn(1000, 10)
y = np.random.randn(1000)
model.fit(x, y)
model.fit(self.x, self.y)
desc = model.save()
new_model = load_model(desc)
......@@ -29,14 +31,12 @@ class TestTreeModel(unittest.TestCase):
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_random_forest_classify_persistence(self):
model = RandomForestClassifier(features=list(range(10)))
x = np.random.randn(1000, 10)
y = np.random.randn(1000)
y = np.where(y > 0, 1, 0)
model.fit(x, y)
y = np.where(self.y > 0, 1, 0)
model.fit(self.x, y)
desc = model.save()
new_model = load_model(desc)
......@@ -44,13 +44,11 @@ class TestTreeModel(unittest.TestCase):
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_xgb_regress_persistence(self):
model = XGBRegressor(features=list(range(10)))
x = np.random.randn(1000, 10)
y = np.random.randn(1000)
model.fit(x, y)
model.fit(self.x, self.y)
desc = model.save()
new_model = load_model(desc)
......@@ -58,14 +56,56 @@ class TestTreeModel(unittest.TestCase):
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_xgb_classify_persistence(self):
model = XGBClassifier(features=list(range(10)))
x = np.random.randn(1000, 10)
y = np.random.randn(1000)
y = np.where(y > 0, 1, 0)
y = np.where(self.y > 0, 1, 0)
model.fit(self.x, y)
desc = model.save()
new_model = load_model(desc)
self.assertEqual(model.features, new_model.features)
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_xgb_trainer_equal_classifier(self):
sample_x = np.random.randn(100, 10)
model.fit(x, y)
model1 = XGBClassifier(n_estimators=100,
learning_rate=0.1,
max_depth=3,
features=list(range(10)),
random_state=42)
model2 = XGBTrainer(features=list(range(10)),
objective='reg:logistic',
booster='gbtree',
tree_method='exact',
n_estimators=100,
learning_rate=0.1,
max_depth=3,
random_state=42)
y = np.where(self.y > 0, 1, 0)
model1.fit(self.x, y)
model2.fit(self.x, y)
predict1 = model1.predict(sample_x)
predict2 = model2.predict(sample_x)
predict2 = np.where(predict2 > 0.5, 1., 0.)
np.testing.assert_array_almost_equal(predict1, predict2)
def test_xgb_trainer_persistence(self):
model = XGBTrainer(features=list(range(10)),
objective='binary:logistic',
booster='gbtree',
tree_method='hist',
n_estimators=200)
y = np.where(self.y > 0, 1, 0)
model.fit(self.x, y)
desc = model.save()
new_model = load_model(desc)
......@@ -73,3 +113,4 @@ class TestTreeModel(unittest.TestCase):
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
......@@ -4,7 +4,12 @@ cd xgboost
git submodule init
git submodule update
mkdir build
cd build
cmake ..
make -j4
cd ..
cd python-package
python setup.py install
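A quick check, assuming the steps above succeeded, confirms the freshly built package is the one Python picks up:

python -c "import xgboost; print(xgboost.__version__)"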
......
......@@ -2,26 +2,31 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import numpy as np\n",
"import xgboost as xgb\n",
"from sklearn.metrics import r2_score\n",
"from sklearn.model_selection import train_test_split\n",
"from alphamind.api import *\n",
"from PyFin.api import *\n",
"\n",
"engine = SqlEngine('postgres+psycopg2://postgres:we083826@localhost/alpha')"
"engine = SqlEngine()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"execution_count": 52,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"start_date = '2012-01-01'\n",
"end_date = '2017-12-31'\n",
"end_date = '2018-01-05'\n",
"\n",
"features = ['roe_q',\n",
" 'ep_q',\n",
......@@ -32,7 +37,7 @@
" 'EPIBS']\n",
"\n",
"freq = '5b'\n",
"batch = 16\n",
"batch = 32\n",
"universe = Universe('custom', ['zz500', 'hs300'])\n",
"benchmark = 905\n",
"neutralized_risk = ['SIZE'] + industry_styles\n",
......@@ -41,9 +46,32 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 53,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2018-01-10 14:56:47,595 - ALPHA_MIND - INFO - Starting data package fetching ...\n",
"2018-01-10 14:56:54,781 - ALPHA_MIND - INFO - factor data loading finished\n",
"2018-01-10 14:57:03,949 - ALPHA_MIND - INFO - return data loading finished\n",
"2018-01-10 14:57:05,113 - ALPHA_MIND - INFO - industry data loading finished\n",
"2018-01-10 14:57:05,828 - ALPHA_MIND - INFO - benchmark data loading finished\n",
"2018-01-10 14:57:15,662 - ALPHA_MIND - INFO - risk data loading finished\n",
"2018-01-10 14:57:17,773 - ALPHA_MIND - INFO - data merging finished\n",
"2018-01-10 14:57:19,490 - ALPHA_MIND - INFO - Loading data is finished\n",
"2018-01-10 14:57:35,324 - ALPHA_MIND - INFO - Data processing is finished\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 47.7 s\n"
]
}
],
"source": [
"%%time\n",
"factor_data = fetch_data_package(engine,\n",
......@@ -62,8 +90,10 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"execution_count": 54,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"features = factor_data['x_names']\n",
......@@ -89,9 +119,37 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 66,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 1.26 s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
......@@ -115,9 +173,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 67,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0107609007052\n",
"-0.480548329833\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
......@@ -128,14 +195,42 @@
"metadata": {},
"source": [
"## Lasso Regression\n",
"------------"
"---------"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 60,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 1.58 s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
......@@ -159,9 +254,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 61,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.00875291615929\n",
"-0.475440026\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
......@@ -177,8 +281,10 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def cross_product(x, y):\n",
......@@ -193,16 +299,35 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 34,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2017-01-03 00:00:00\n"
]
},
{
"ename": "NameError",
"evalue": "name 'cross_product' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<timed exec>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n",
"\u001b[1;31mNameError\u001b[0m: name 'cross_product' is not defined"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
"for i, date in enumerate(train_dates[:1]):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
......@@ -223,9 +348,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0291928676769\n",
"-0.24146254373\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
......@@ -241,9 +375,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2017-01-03 00:00:00\n",
"2017-04-27 00:00:00\n",
"2017-08-15 00:00:00\n",
"2017-12-05 00:00:00\n",
"Wall time: 4.78 s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
......@@ -271,9 +417,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.000355789142204\n",
"-0.200552889618\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
......@@ -289,9 +444,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2017-01-03 00:00:00\n",
"2017-04-27 00:00:00\n",
"2017-08-15 00:00:00\n",
"2017-12-05 00:00:00\n",
"Wall time: 1min 18s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
......@@ -315,9 +482,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0137863030105\n",
"-0.197952235791\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
......@@ -333,9 +509,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2017-01-03 00:00:00\n",
"2017-04-27 00:00:00\n",
"2017-08-15 00:00:00\n",
"2017-12-05 00:00:00\n",
"Wall time: 1min 32s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
......@@ -347,7 +535,12 @@
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" model = XGBRegressor(n_estimators=500, max_features='sqrt', max_depth=3, n_jobs=-1)\n",
" model = XGBRegressor(n_estimators=500,\n",
" learning_rate=0.02,\n",
" max_depth=3,\n",
" n_jobs=-1,\n",
" subsample=0.25,\n",
" colsample_bytree=0.5)\n",
" model.fit(x, y)\n",
" train_scores.append(model.score(x, y))\n",
" \n",
......@@ -358,9 +551,66 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 17,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0575499865219\n",
"-0.209037365429\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Native XGBoost Regressor\n",
"---------------"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 6min 57s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
......@@ -372,19 +622,58 @@
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" model = XGBRegressor(n_estimators=500, max_features='sqrt', max_depth=3, n_jobs=-1)\n",
" model.fit(x, y)\n",
" new_train_scores.append(model.score(x, y))\n",
" \n",
" x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.33, random_state=42)\n",
" \n",
" dtrain = xgb.DMatrix(x_train, y_train)\n",
" deval = xgb.DMatrix(x_eval, y_eval)\n",
" param = {'silent': 1,\n",
" 'objective': 'reg:linear',\n",
" 'max_depth': 3,\n",
" 'eta': 0.005,\n",
" 'boost': 'gbtree',\n",
" 'tree_method': 'hist',\n",
" 'subsample': 0.1,\n",
" 'colsample_bytree': 0.25}\n",
" num_round = 2000\n",
" model = xgb.train(param, dtrain, num_round, evals=[(deval, 'eval')], early_stopping_rounds=50, verbose_eval=False)\n",
" \n",
" y_train_predict = model.predict(dtrain)\n",
" train_scores.append(r2_score(y_train, y_train_predict, multioutput='uniform_average'))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" new_predict_scores.append(model.score(p_x, p_y))"
" dtest = xgb.DMatrix(p_x, p_y)\n",
" \n",
" y_test_predict = model.predict(dtest)\n",
" predict_scores.append(r2_score(p_y, y_test_predict, multioutput='uniform_average'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 69,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0158347715471\n",
"-0.477095380466\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
......@@ -405,7 +694,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
"version": "3.6.3"
}
},
"nbformat": 4,
......
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import xgboost as xgb\n",
"import numpy as np\n",
"from alphamind.api import *\n",
"from PyFin.api import *\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"engine = SqlEngine()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"start_date = '2012-01-01'\n",
"end_date = '2018-01-05'\n",
"\n",
"features = ['roe_q',\n",
" 'ep_q',\n",
" 'DivP',\n",
" 'cfinc1_q',\n",
" 'EBIT',\n",
" 'EARNYILD',\n",
" 'EPIBS']\n",
"\n",
"freq = '10b'\n",
"batch = 16\n",
"universe = Universe('custom', ['zz500', 'hs300'])\n",
"benchmark = 905\n",
"neutralized_risk = ['SIZE'] + industry_styles\n",
"horizon = map_freq(freq)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2018-01-11 15:12:44,105 - ALPHA_MIND - INFO - Starting data package fetching ...\n",
"2018-01-11 15:12:53,578 - ALPHA_MIND - INFO - factor data loading finished\n",
"2018-01-11 15:13:03,880 - ALPHA_MIND - INFO - return data loading finished\n",
"2018-01-11 15:13:05,384 - ALPHA_MIND - INFO - industry data loading finished\n",
"2018-01-11 15:13:06,178 - ALPHA_MIND - INFO - benchmark data loading finished\n",
"2018-01-11 15:13:17,845 - ALPHA_MIND - INFO - risk data loading finished\n",
"2018-01-11 15:13:21,266 - ALPHA_MIND - INFO - data merging finished\n",
"2018-01-11 15:13:23,371 - ALPHA_MIND - INFO - Loading data is finished\n",
"2018-01-11 15:13:33,174 - ALPHA_MIND - INFO - Data processing is finished\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 49.1 s\n"
]
}
],
"source": [
"%%time\n",
"factor_data = fetch_data_package(engine,\n",
" features,\n",
" start_date,\n",
" end_date,\n",
" '5b',\n",
" universe,\n",
" benchmark,\n",
" batch=batch,\n",
" warm_start=batch,\n",
" neutralized_risk=neutralized_risk, \n",
" pre_process=[winsorize_normal, standardize],\n",
" post_process=[winsorize_normal, standardize])"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"features = factor_data['x_names']\n",
"\n",
"train_x = factor_data['train']['x']\n",
"train_y = factor_data['train']['y']\n",
"train_risk = factor_data['train']['risk']\n",
"ref_dates = sorted(train_x.keys())\n",
"\n",
"predict_x = factor_data['predict']['x']\n",
"predict_y = factor_data['predict']['y']\n",
"predict_risk = factor_data['predict']['risk']\n",
"settlement = factor_data['settlement']"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"for key, val in train_y.items():\n",
" train_y[key] = np.where(val > 0., 1, 0)\n",
" \n",
"for key, val in predict_y.items():\n",
" predict_y[key] = np.where(val > 0., 1, 0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Logistic Regression\n",
"--------------"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 5.34 s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" \n",
" model = LogisticRegression(fit_intercept=False, features=features)\n",
" model.fit(x, y)\n",
" train_scores.append(model.score(x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" predict_scores.append(model.score(p_x, p_y))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.541013986745\n",
"0.51932344036\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Random Forest Classifier\n",
"-----------"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 15min 34s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" \n",
" model = RandomForestClassifier(n_estimators=1000, max_features='sqrt', max_depth=3, n_jobs=-1)\n",
" model.fit(x, y)\n",
" train_scores.append(model.score(x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" predict_scores.append(model.score(p_x, p_y))"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.557563825608\n",
"0.553974775005\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## XGBoost Classifier\n",
"---------"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 13min 40s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" \n",
" model = XGBClassifier(n_estimators=1000,\n",
" learning_rate=0.02,\n",
" max_depth=3,\n",
" n_jobs=-1,\n",
" subsample=0.25,\n",
" colsample_bytree=0.5)\n",
" model.fit(x, y)\n",
" train_scores.append(model.score(x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" predict_scores.append(model.score(p_x, p_y))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.642946015759\n",
"0.537550683184\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Native XGBoost Classifier\n",
"---------------"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 1min 6s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" \n",
" x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.33, random_state=42)\n",
" \n",
" dtrain = xgb.DMatrix(x_train, y_train)\n",
" deval = xgb.DMatrix(x_eval, y_eval)\n",
" param = {'silent': 1,\n",
" 'objective': 'binary:logistic',\n",
" 'max_depth': 3,\n",
" 'eta': 0.01,\n",
" 'boost': 'dart',\n",
" 'tree_method': 'hist',\n",
" 'subsample': 0.25,\n",
" 'colsample_bytree': 0.5}\n",
" num_round = 2000\n",
" model = xgb.train(param, dtrain, num_round, evals=[(deval, 'eval')], early_stopping_rounds=50, verbose_eval=False)\n",
" \n",
" y_train_predict = model.predict(dtrain)\n",
" label = dtrain.get_label()\n",
" train_score = np.sum((y_train_predict > 0.5) == label) / float(len(label))\n",
"\n",
" train_scores.append(train_score)\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" dtest = xgb.DMatrix(p_x, p_y)\n",
" \n",
" y_test_predict = model.predict(dtest)\n",
" p_label = dtest.get_label()\n",
" test_score = np.sum((y_test_predict > 0.5) == p_label) / float(len(p_label))\n",
" predict_scores.append(test_score)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.567225761699\n",
"0.550997907465\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Logistic Regression with More Features\n",
"-----------------"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def cross_product(x, y):\n",
" n, m = x.shape\n",
" res = []\n",
" \n",
" for j in range(m):\n",
" res.append(x[:, [j]] * y)\n",
" \n",
" return np.concatenate(res, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 36.1 s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" risk = train_risk[date][:, 1:]\n",
" new_x = cross_product(x, risk)\n",
" \n",
" model = LogisticRegression(fit_intercept=False, features=features)\n",
" model.fit(new_x, y)\n",
" train_scores.append(model.score(new_x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" p_risk = predict_risk[date][:, 1:]\n",
" new_p_x = cross_product(p_x, p_risk)\n",
" predict_scores.append(model.score(new_p_x, p_y))"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.568125478425\n",
"0.517523115163\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Random Forest Classifier with More Features\n",
"-----------"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 14min 40s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" risk = train_risk[date][:, 1:]\n",
" new_x = cross_product(x, risk)\n",
" \n",
" model = RandomForestClassifier(n_estimators=1000, max_features='sqrt', max_depth=3, n_jobs=-1)\n",
" model.fit(new_x, y)\n",
" train_scores.append(model.score(new_x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" p_risk = predict_risk[date][:, 1:]\n",
" new_p_x = cross_product(p_x, p_risk)\n",
" predict_scores.append(model.score(new_p_x, p_y))"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.549090142483\n",
"0.559944504146\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## XGBoost Classifier with More Features\n",
"---------"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 12min 25s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" risk = train_risk[date][:, 1:]\n",
" new_x = cross_product(x, risk)\n",
" \n",
" model = XGBClassifier(n_estimators=500,\n",
" learning_rate=0.02,\n",
" max_depth=3,\n",
" n_jobs=-1,\n",
" subsample=0.25,\n",
" colsample_bytree=0.1)\n",
" model.fit(new_x, y)\n",
" train_scores.append(model.score(new_x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" p_risk = predict_risk[date][:, 1:]\n",
" new_p_x = cross_product(p_x, p_risk)\n",
" predict_scores.append(model.score(new_p_x, p_y))"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.59375573895\n",
"0.55230987889\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Native XGBoost Classifier with More Features\n",
"---------------"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 5min 23s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" risk = train_risk[date][:, 1:]\n",
" new_x = cross_product(x, risk)\n",
" \n",
" x_train, x_eval, y_train, y_eval = train_test_split(new_x, y, test_size=0.33, random_state=42)\n",
" \n",
" dtrain = xgb.DMatrix(x_train, y_train)\n",
" deval = xgb.DMatrix(x_eval, y_eval)\n",
" param = {'silent': 1,\n",
" 'objective': 'binary:logistic',\n",
" 'max_depth': 3,\n",
" 'eta': 0.01,\n",
" 'booster': 'dart',\n",
" 'tree_method': 'hist',\n",
" 'subsample': 0.25,\n",
" 'colsample_bytree': 0.5}\n",
" num_round = 2000\n",
" model = xgb.train(param, dtrain, num_round, evals=[(deval, 'eval')], early_stopping_rounds=50, verbose_eval=False)\n",
" \n",
" y_train_predict = model.predict(dtrain)\n",
" label = dtrain.get_label()\n",
" train_score = np.sum((y_train_predict > 0.5) == label) / float(len(label))\n",
"\n",
" train_scores.append(train_score)\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" p_risk = predict_risk[date][:, 1:]\n",
" new_p_x = cross_product(p_x, p_risk)\n",
" dtest = xgb.DMatrix(new_p_x, p_y)\n",
" \n",
" y_test_predict = model.predict(dtest)\n",
" p_label = dtest.get_label()\n",
" test_score = np.sum((y_test_predict > 0.5) == p_label) / float(len(p_label))\n",
" predict_scores.append(test_score)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.560057712549\n",
"0.552663472836\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Subproject commit bf4367184164e593cd2856ef38f8dd4f8cc76999
Subproject commit a187ed6c8f3aa40b47d5be80667cbbe6a6fd563d