Commit d99cf0c5 authored by Yucheng

Merge branch 'master' of https://github.com/lion-sing/alpha-mind

parents bfc257f9 7924db6d
......@@ -33,6 +33,7 @@ from alphamind.model import RandomForestRegressor
from alphamind.model import RandomForestClassifier
from alphamind.model import XGBRegressor
from alphamind.model import XGBClassifier
from alphamind.model import XGBTrainer
from alphamind.model import load_model
from alphamind.model.data_preparing import fetch_data_package
from alphamind.model.data_preparing import fetch_train_phase
......@@ -74,6 +75,7 @@ __all__ = [
'RandomForestClassifier',
'XGBRegressor',
'XGBClassifier',
'XGBTrainer',
'load_model',
'NaiveExecutor',
'ThresholdExecutor',
......
......@@ -1954,5 +1954,5 @@ class OutrightTmp(Base):
if __name__ == '__main__':
from sqlalchemy import create_engine
engine = create_engine('postgres+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
engine = create_engine('postgres+psycopg2://postgres:we083826@101.132.104.118/alpha')
Base.metadata.create_all(engine)
......@@ -388,11 +388,13 @@ class SqlEngine(object):
df = pd.read_sql(query, self.engine)
if universe.is_filtered:
codes = universe.query(self, start_date, end_date, dates)
df = pd.merge(df, codes, how='inner', on=['trade_date', 'code']).sort_values(['trade_date', 'code'])
df = pd.merge(df, codes, how='inner', on=['trade_date', 'code'])
if external_data is not None:
df = pd.merge(df, external_data, on=['trade_date', 'code']).dropna()
df.sort_values(['trade_date', 'code'], inplace=True)
df.set_index('trade_date', inplace=True)
res = transformer.transform('code', df)
......
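The sort was moved after the external-data merge because pd.merge does not guarantee row order, so the frame should be sorted once, after the last merge. A minimal sketch of the pattern, using hypothetical toy frames:

import pandas as pd

# toy stand-ins for the factor frame and the external data (hypothetical values)
df = pd.DataFrame({'trade_date': ['2018-01-02', '2018-01-01'],
                   'code': [2, 1],
                   'factor': [0.5, 0.3]})
external_data = pd.DataFrame({'trade_date': ['2018-01-01', '2018-01-02'],
                              'code': [1, 2],
                              'ext': [1.0, 2.0]})

df = pd.merge(df, external_data, on=['trade_date', 'code']).dropna()
# sort once, after the final merge, then index by date as the query code does
df.sort_values(['trade_date', 'code'], inplace=True)
df.set_index('trade_date', inplace=True)
print(df)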
......@@ -17,25 +17,24 @@ import datetime as dt
start = dt.datetime.now()
universe_name = 'zz500'
universe = Universe('custom', ['zz800'])
factor_name = 'PE'
expression = 1. / LAST(factor_name)
simple_expression = CSRes(LAST('OperCashInToAsset'), 'roe_q')
alpha_factor_name = '1/PE'
alpha_factor = {alpha_factor_name: expression}
alpha_factor_name = 'alpha_factor'
alpha_factor = {alpha_factor_name: simple_expression}
# end of formula definition
engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
universe = Universe('custom', [universe_name])
neutralize_risk = ['SIZE'] + industry_styles
freq = '5b'
neutralize_risk = ['SIZE', 'LEVERAGE'] + industry_styles
freq = '10b'
n_bins = 5
horizon = map_freq(freq)
start_date = '2012-01-01'
end_date = '2017-11-21'
end_date = '2018-01-05'
dates = makeSchedule(start_date,
end_date,
......@@ -93,10 +92,9 @@ df = df.cumsum().plot(ax=axes[0], title='Quantile Analysis for {0}'.format(alpha
# =================================================================== #
factor_name = 'PE'
expression = DIFF(1./LAST(factor_name))
alpha_factor_name = '1/PE_1w_diff'
alpha_factor = {alpha_factor_name: expression}
alpha_factor_name = alpha_factor_name + '_1w_diff'
alpha_factor = {alpha_factor_name: DIFF(simple_expression)}
dates = makeSchedule(start_date,
end_date,
......
......@@ -14,6 +14,7 @@ from alphamind.model.treemodel import RandomForestRegressor
from alphamind.model.treemodel import RandomForestClassifier
from alphamind.model.treemodel import XGBRegressor
from alphamind.model.treemodel import XGBClassifier
from alphamind.model.treemodel import XGBTrainer
from alphamind.model.loader import load_model
......@@ -26,4 +27,5 @@ __all__ = ['LinearRegression',
'RandomForestClassifier',
'XGBRegressor',
'XGBClassifier',
'XGBTrainer',
'load_model']
\ No newline at end of file
......@@ -27,6 +27,7 @@ from alphamind.utilities import map_freq
def _merge_df(engine, names, factor_df, return_df, universe, dates, risk_model, neutralized_risk):
risk_df = engine.fetch_risk_model_range(universe, dates=dates, risk_model=risk_model)[1]
alpha_logger.info("risk data loading finished")
used_neutralized_risk = list(set(total_risk_factors).difference(names))
risk_df = risk_df[['trade_date', 'code'] + used_neutralized_risk].dropna()
return_df = pd.merge(return_df, risk_df, on=['trade_date', 'code'])
......@@ -45,9 +46,10 @@ def _merge_df(engine, names, factor_df, return_df, universe, dates, risk_model,
x_values = train_x[names].values.astype(float)
y_values = train_y[['dx']].values
codes = train_x['code'].values
date_label = pd.DatetimeIndex(factor_df.trade_date).to_pydatetime()
dates = np.unique(date_label)
return return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y
return return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y, codes
def prepare_data(engine: SqlEngine,
......@@ -82,9 +84,13 @@ def prepare_data(engine: SqlEngine,
factor_df = engine.fetch_factor_range(universe,
factors=transformer,
dates=dates).sort_values(['trade_date', 'code'])
alpha_logger.info("factor data loading finished")
return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
alpha_logger.info("return data loading finished")
industry_df = engine.fetch_industry_range(universe, dates=dates)
alpha_logger.info("industry data loading finished")
benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
alpha_logger.info("benchmark data loading finished")
df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()
df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
......@@ -102,13 +108,15 @@ def batch_processing(x_values,
batch,
risk_exp,
pre_process,
post_process):
post_process,
codes):
train_x_buckets = {}
train_y_buckets = {}
train_risk_buckets = {}
predict_x_buckets = {}
predict_y_buckets = {}
predict_risk_buckets = {}
predict_codes_bucket = {}
for i, start in enumerate(groups[:-batch]):
end = groups[i + batch]
......@@ -141,6 +149,7 @@ def batch_processing(x_values,
sub_dates = group_label[left_index:right_index]
this_raw_x = x_values[left_index:right_index]
this_codes = codes[left_index:right_index]
if risk_exp is not None:
this_risk_exp = risk_exp[left_index:right_index]
......@@ -156,6 +165,7 @@ def batch_processing(x_values,
inner_right_index = bisect.bisect_right(sub_dates, end)
predict_x_buckets[end] = ne_x[inner_left_index:inner_right_index]
predict_risk_buckets[end] = this_risk_exp[inner_left_index:inner_right_index]
predict_codes_bucket[end] = this_codes[inner_left_index:inner_right_index]
this_raw_y = y_values[left_index:right_index]
if len(this_raw_y) > 0:
......@@ -165,7 +175,13 @@ def batch_processing(x_values,
post_process=post_process)
predict_y_buckets[end] = ne_y[inner_left_index:inner_right_index]
return train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets
return train_x_buckets, \
train_y_buckets, \
train_risk_buckets, \
predict_x_buckets, \
predict_y_buckets, \
predict_risk_buckets, \
predict_codes_bucket
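A minimal sketch of the bisect windowing that fills these buckets, with made-up dates: the outer loop slides a window of batch periods ending at end, and the inner bisect isolates the rows dated exactly end for the predict buckets.

import bisect
import numpy as np

# per-row date labels must be sorted for bisect to be valid (made-up dates)
group_label = np.array(['2018-01-01', '2018-01-01', '2018-01-02',
                        '2018-01-03', '2018-01-03'], dtype='datetime64[D]')
groups = np.unique(group_label)
batch = 2

for i, start in enumerate(groups[:-batch]):
    end = groups[i + batch]
    left_index = bisect.bisect_left(group_label, start)
    right_index = bisect.bisect_right(group_label, end)
    sub_dates = group_label[left_index:right_index]
    # rows labelled exactly `end` feed the predict bucket keyed by `end`
    inner_left = bisect.bisect_left(sub_dates, end)
    inner_right = bisect.bisect_right(sub_dates, end)
    print(end, 'window rows:', right_index - left_index,
          'predict rows:', inner_right - inner_left)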
def fetch_data_package(engine: SqlEngine,
......@@ -193,9 +209,11 @@ def fetch_data_package(engine: SqlEngine,
benchmark,
warm_start)
return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y = \
return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y, codes = \
_merge_df(engine, transformer.names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)
alpha_logger.info("data merging finished")
return_df['weight'] = train_x['weight']
return_df['industry'] = train_x['industry']
return_df['industry_code'] = train_x['industry_code']
......@@ -207,15 +225,16 @@ def fetch_data_package(engine: SqlEngine,
alpha_logger.info("Loading data is finished")
train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets = batch_processing(
x_values,
train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets, predict_codes_bucket \
= batch_processing(x_values,
y_values,
dates,
date_label,
batch,
risk_exp,
pre_process,
post_process)
post_process,
codes)
alpha_logger.info("Data processing is finished")
......@@ -223,7 +242,7 @@ def fetch_data_package(engine: SqlEngine,
ret['x_names'] = transformer.names
ret['settlement'] = return_df
ret['train'] = {'x': train_x_buckets, 'y': train_y_buckets, 'risk': train_risk_buckets}
ret['predict'] = {'x': predict_x_buckets, 'y': predict_y_buckets, 'risk': predict_risk_buckets}
ret['predict'] = {'x': predict_x_buckets, 'y': predict_y_buckets, 'risk': predict_risk_buckets, 'code': predict_codes_bucket}
return ret
......@@ -261,7 +280,7 @@ def fetch_train_phase(engine,
return_df, factor_df = df[['trade_date', 'code', 'dx']], df[
['trade_date', 'code', 'isOpen'] + transformer.names]
return_df, dates, date_label, risk_exp, x_values, y_values, _, _ = \
return_df, dates, date_label, risk_exp, x_values, y_values, _, _, codes = \
_merge_df(engine, transformer.names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)
if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
......@@ -274,6 +293,7 @@ def fetch_train_phase(engine,
index = (date_label >= start) & (date_label <= end)
this_raw_x = x_values[index]
this_raw_y = y_values[index]
this_code = codes[index]
if risk_exp is not None:
this_risk_exp = risk_exp[index]
else:
......@@ -291,7 +311,7 @@ def fetch_train_phase(engine,
ret = dict()
ret['x_names'] = transformer.names
ret['train'] = {'x': ne_x, 'y': ne_y}
ret['train'] = {'x': ne_x, 'y': ne_y, 'code': this_code}
return ret
......@@ -342,7 +362,6 @@ def fetch_predict_phase(engine,
end = dates[-1]
start = dates[-batch]
# index = (date_label >= start) & (date_label <= end)
left_index = bisect.bisect_left(date_label, start)
right_index = bisect.bisect_right(date_label, end)
this_raw_x = x_values[left_index:right_index]
......@@ -380,26 +399,12 @@ def fetch_predict_phase(engine,
if __name__ == '__main__':
engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
universe = Universe('zz500', ['ashare_ex'])
universe = Universe('zz500', ['hs300', 'zz500'])
neutralized_risk = ['SIZE']
res = fetch_train_phase(engine,
['EPS', 'CFinc1'],
'2017-09-04',
'2w',
res = fetch_predict_phase(engine, ['ep_q'],
'2018-01-08',
'5b',
universe,
4,
warm_start=1,
16,
neutralized_risk=neutralized_risk)
print(res)
res = fetch_predict_phase(engine,
['EPS', 'CFinc1'],
'2017-09-04',
'2w',
universe,
4,
warm_start=1,
neutralized_risk=neutralized_risk)
print(res)
......@@ -14,6 +14,7 @@ from alphamind.model.treemodel import RandomForestRegressor
from alphamind.model.treemodel import RandomForestClassifier
from alphamind.model.treemodel import XGBRegressor
from alphamind.model.treemodel import XGBClassifier
from alphamind.model.treemodel import XGBTrainer
def load_model(model_desc: dict) -> ModelBase:
......@@ -37,5 +38,7 @@ def load_model(model_desc: dict) -> ModelBase:
return XGBRegressor.load(model_desc)
elif 'XGBClassifier' in model_name_parts:
return XGBClassifier.load(model_desc)
elif 'XGBTrainer' in model_name_parts:
return XGBTrainer.load(model_desc)
else:
raise ValueError('{0} is not currently supported in model loader.'.format(model_name))
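A hedged round-trip sketch for the new XGBTrainer branch, on synthetic data (names as exported in the __init__ diff above):

import numpy as np
from alphamind.model import XGBTrainer, load_model

x = np.random.randn(500, 10)
y = np.where(np.random.randn(500) > 0., 1, 0)

model = XGBTrainer(features=list(range(10)), objective='binary:logistic')
model.fit(x, y)

desc = model.save()           # a plain dict, so it can be persisted anywhere
new_model = load_model(desc)  # dispatches on the class name in the description
np.testing.assert_array_almost_equal(model.predict(x), new_model.predict(x))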
......@@ -21,7 +21,7 @@ class ModelBase(metaclass=abc.ABCMeta):
self.impl = None
self.trained_time = None
def fit(self, x, y):
def fit(self, x: np.ndarray, y: np.ndarray):
self.impl.fit(x, y.flatten())
self.trained_time = arrow.now().format("YYYY-MM-DD HH:mm:ss")
......
......@@ -7,9 +7,13 @@ Created on 2017-12-4
from typing import List
from distutils.version import LooseVersion
import arrow
import numpy as np
from sklearn import __version__ as sklearn_version
from sklearn.ensemble import RandomForestRegressor as RandomForestRegressorImpl
from sklearn.ensemble import RandomForestClassifier as RandomForestClassifierImpl
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import __version__ as xgboost_version
from xgboost import XGBRegressor as XGBRegressorImpl
from xgboost import XGBClassifier as XGBClassifierImpl
......@@ -155,4 +159,91 @@ class XGBClassifier(ModelBase):
return self.impl.feature_importances_.tolist()
class XGBTrainer(ModelBase):
def __init__(self,
objective='binary:logistic',
booster='gbtree',
tree_method='hist',
n_estimators: int=100,
learning_rate: float=0.1,
max_depth=3,
eval_sample=None,
early_stopping_rounds=None,
subsample=1.,
colsample_bytree=1.,
features: List = None,
random_state=0,
**kwargs):
super().__init__(features)
self.params = {
'silent': 1,
'objective': objective,
'max_depth': max_depth,
'eta': learning_rate,
'booster': booster,
'tree_method': tree_method,
'subsample': subsample,
'colsample_bytree': colsample_bytree,
'seed': random_state
}
self.eval_sample = eval_sample
self.num_boost_round = n_estimators
self.early_stopping_rounds = early_stopping_rounds
self.impl = None
self.kwargs = kwargs
def fit(self, x, y):
if self.eval_sample:
x_train, x_eval, y_train, y_eval = train_test_split(x,
y,
test_size=self.eval_sample,
random_state=42)
d_train = xgb.DMatrix(x_train, y_train)
d_eval = xgb.DMatrix(x_eval, y_eval)
self.impl = xgb.train(params=self.params,
dtrain=d_train,
num_boost_round=self.num_boost_round,
evals=[(d_eval, 'eval')],
early_stopping_rounds=self.early_stopping_rounds,
verbose_eval=False,
**self.kwargs)
else:
d_train = xgb.DMatrix(x, y)
self.impl = xgb.train(params=self.params,
dtrain=d_train,
num_boost_round=self.num_boost_round,
**self.kwargs)
self.trained_time = arrow.now().format("YYYY-MM-DD HH:mm:ss")
def predict(self, x: np.ndarray) -> np.ndarray:
d_predict = xgb.DMatrix(x)
return self.impl.predict(d_predict)
def save(self) -> dict:
model_desc = super().save()
model_desc['xgboost_version'] = xgboost_version
model_desc['importances'] = self.importances
return model_desc
@classmethod
def load(cls, model_desc: dict):
obj_layout = super().load(model_desc)
if LooseVersion(xgboost_version) < LooseVersion(model_desc['xgboost_version']):
alpha_logger.warning('Current xgboost version {0} is lower than the model version {1}. '
'Loaded model may work incorrectly.'.format(xgboost_version,
model_desc['xgboost_version']))
return obj_layout
@property
def importances(self):
imps = self.impl.get_fscore().items()
imps = sorted(imps, key=lambda x: int(x[0][1:]))  # numeric feature index; a plain string sort puts 'f10' before 'f2'
return list(zip(*imps))[1]
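A minimal usage sketch for the eval split, on synthetic data; early stopping only takes effect when eval_sample is set, since that is the branch that builds an eval set:

import numpy as np
from alphamind.model.treemodel import XGBTrainer

x = np.random.randn(1000, 10)
y = np.where(np.random.randn(1000) > 0., 1, 0)

model = XGBTrainer(features=list(range(10)),
                   objective='binary:logistic',
                   n_estimators=200,
                   eval_sample=0.25,          # hold out 25% of rows as the eval set
                   early_stopping_rounds=20)
model.fit(x, y)
proba = model.predict(x)                      # binary:logistic yields probabilities
labels = np.where(proba > 0.5, 1, 0)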
......@@ -12,16 +12,18 @@ from alphamind.model.treemodel import RandomForestRegressor
from alphamind.model.treemodel import RandomForestClassifier
from alphamind.model.treemodel import XGBRegressor
from alphamind.model.treemodel import XGBClassifier
from alphamind.model.treemodel import XGBTrainer
class TestTreeModel(unittest.TestCase):
def setUp(self):
self.x = np.random.randn(1000, 10)
self.y = np.random.randn(1000)
def test_random_forest_regress_persistence(self):
model = RandomForestRegressor(features=list(range(10)))
x = np.random.randn(1000, 10)
y = np.random.randn(1000)
model.fit(x, y)
model.fit(self.x, self.y)
desc = model.save()
new_model = load_model(desc)
......@@ -29,14 +31,12 @@ class TestTreeModel(unittest.TestCase):
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_random_forest_classify_persistence(self):
model = RandomForestClassifier(features=list(range(10)))
x = np.random.randn(1000, 10)
y = np.random.randn(1000)
y = np.where(y > 0, 1, 0)
model.fit(x, y)
y = np.where(self.y > 0, 1, 0)
model.fit(self.x, y)
desc = model.save()
new_model = load_model(desc)
......@@ -44,13 +44,11 @@ class TestTreeModel(unittest.TestCase):
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_xgb_regress_persistence(self):
model = XGBRegressor(features=list(range(10)))
x = np.random.randn(1000, 10)
y = np.random.randn(1000)
model.fit(x, y)
model.fit(self.x, self.y)
desc = model.save()
new_model = load_model(desc)
......@@ -58,14 +56,56 @@ class TestTreeModel(unittest.TestCase):
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_xgb_classify_persistence(self):
model = XGBClassifier(features=list(range(10)))
x = np.random.randn(1000, 10)
y = np.random.randn(1000)
y = np.where(y > 0, 1, 0)
y = np.where(self.y > 0, 1, 0)
model.fit(self.x, y)
desc = model.save()
new_model = load_model(desc)
self.assertEqual(model.features, new_model.features)
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_xgb_trainer_equal_classifier(self):
sample_x = np.random.randn(100, 10)
model.fit(x, y)
model1 = XGBClassifier(n_estimators=100,
learning_rate=0.1,
max_depth=3,
features=list(range(10)),
random_state=42)
model2 = XGBTrainer(features=list(range(10)),
objective='reg:logistic',
booster='gbtree',
tree_method='exact',
n_estimators=100,
learning_rate=0.1,
max_depth=3,
random_state=42)
y = np.where(self.y > 0, 1, 0)
model1.fit(self.x, y)
model2.fit(self.x, y)
predict1 = model1.predict(sample_x)
predict2 = model2.predict(sample_x)
predict2 = np.where(predict2 > 0.5, 1., 0.)
np.testing.assert_array_almost_equal(predict1, predict2)
def test_xgb_trainer_persistence(self):
model = XGBTrainer(features=list(range(10)),
objective='binary:logistic',
booster='gbtree',
tree_method='hist',
n_estimators=200)
y = np.where(self.y > 0, 1, 0)
model.fit(self.x, y)
desc = model.save()
new_model = load_model(desc)
......@@ -73,3 +113,4 @@ class TestTreeModel(unittest.TestCase):
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
......@@ -4,7 +4,12 @@ cd xgboost
git submodule init
git submodule update
mkdir build
cd build
cmake ..
make -j4
cd ..
cd python-package
python setup.py install
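A quick check, assuming the steps above succeeded, confirms the freshly built package is the one Python picks up:

python -c "import xgboost; print(xgboost.__version__)"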
......
......@@ -2,26 +2,31 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import numpy as np\n",
"import xgboost as xgb\n",
"from sklearn.metrics import r2_score\n",
"from sklearn.model_selection import train_test_split\n",
"from alphamind.api import *\n",
"from PyFin.api import *\n",
"\n",
"engine = SqlEngine('postgres+psycopg2://postgres:we083826@localhost/alpha')"
"engine = SqlEngine()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"execution_count": 52,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"start_date = '2012-01-01'\n",
"end_date = '2017-12-31'\n",
"end_date = '2018-01-05'\n",
"\n",
"features = ['roe_q',\n",
" 'ep_q',\n",
......@@ -32,7 +37,7 @@
" 'EPIBS']\n",
"\n",
"freq = '5b'\n",
"batch = 16\n",
"batch = 32\n",
"universe = Universe('custom', ['zz500', 'hs300'])\n",
"benchmark = 905\n",
"neutralized_risk = ['SIZE'] + industry_styles\n",
......@@ -41,9 +46,32 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 53,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2018-01-10 14:56:47,595 - ALPHA_MIND - INFO - Starting data package fetching ...\n",
"2018-01-10 14:56:54,781 - ALPHA_MIND - INFO - factor data loading finished\n",
"2018-01-10 14:57:03,949 - ALPHA_MIND - INFO - return data loading finished\n",
"2018-01-10 14:57:05,113 - ALPHA_MIND - INFO - industry data loading finished\n",
"2018-01-10 14:57:05,828 - ALPHA_MIND - INFO - benchmark data loading finished\n",
"2018-01-10 14:57:15,662 - ALPHA_MIND - INFO - risk data loading finished\n",
"2018-01-10 14:57:17,773 - ALPHA_MIND - INFO - data merging finished\n",
"2018-01-10 14:57:19,490 - ALPHA_MIND - INFO - Loading data is finished\n",
"2018-01-10 14:57:35,324 - ALPHA_MIND - INFO - Data processing is finished\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 47.7 s\n"
]
}
],
"source": [
"%%time\n",
"factor_data = fetch_data_package(engine,\n",
......@@ -62,8 +90,10 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"execution_count": 54,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"features = factor_data['x_names']\n",
......@@ -89,9 +119,37 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 66,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 1.26 s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
......@@ -115,9 +173,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 67,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0107609007052\n",
"-0.480548329833\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
......@@ -128,14 +195,42 @@
"metadata": {},
"source": [
"## Lasso Regression\n",
"------------"
"---------"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 60,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 1.58 s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
......@@ -159,9 +254,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 61,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.00875291615929\n",
"-0.475440026\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
......@@ -177,8 +281,10 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def cross_product(x, y):\n",
......@@ -193,16 +299,35 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 34,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2017-01-03 00:00:00\n"
]
},
{
"ename": "NameError",
"evalue": "name 'cross_product' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<timed exec>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n",
"\u001b[1;31mNameError\u001b[0m: name 'cross_product' is not defined"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
"for i, date in enumerate(train_dates[:1]):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
......@@ -223,9 +348,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0291928676769\n",
"-0.24146254373\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
......@@ -241,9 +375,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2017-01-03 00:00:00\n",
"2017-04-27 00:00:00\n",
"2017-08-15 00:00:00\n",
"2017-12-05 00:00:00\n",
"Wall time: 4.78 s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
......@@ -271,9 +417,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.000355789142204\n",
"-0.200552889618\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
......@@ -289,9 +444,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2017-01-03 00:00:00\n",
"2017-04-27 00:00:00\n",
"2017-08-15 00:00:00\n",
"2017-12-05 00:00:00\n",
"Wall time: 1min 18s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
......@@ -315,9 +482,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0137863030105\n",
"-0.197952235791\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
......@@ -333,9 +509,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2017-01-03 00:00:00\n",
"2017-04-27 00:00:00\n",
"2017-08-15 00:00:00\n",
"2017-12-05 00:00:00\n",
"Wall time: 1min 32s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
......@@ -347,7 +535,12 @@
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" model = XGBRegressor(n_estimators=500, max_features='sqrt', max_depth=3, n_jobs=-1)\n",
" model = XGBRegressor(n_estimators=500,\n",
" learning_rate=0.02,\n",
" max_depth=3,\n",
" n_jobs=-1,\n",
" subsample=0.25,\n",
" colsample_bytree=0.5)\n",
" model.fit(x, y)\n",
" train_scores.append(model.score(x, y))\n",
" \n",
......@@ -358,9 +551,66 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 17,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0575499865219\n",
"-0.209037365429\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Native XGBoost Regressor\n",
"---------------"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 6min 57s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
......@@ -372,19 +622,58 @@
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" model = XGBRegressor(n_estimators=500, max_features='sqrt', max_depth=3, n_jobs=-1)\n",
" model.fit(x, y)\n",
" new_train_scores.append(model.score(x, y))\n",
" \n",
" x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.33, random_state=42)\n",
" \n",
" dtrain = xgb.DMatrix(x_train, y_train)\n",
" deval = xgb.DMatrix(x_eval, y_eval)\n",
" param = {'silent': 1,\n",
" 'objective': 'reg:linear',\n",
" 'max_depth': 3,\n",
" 'eta': 0.005,\n",
" 'boost': 'gbtree',\n",
" 'tree_method': 'hist',\n",
" 'subsample': 0.1,\n",
" 'colsample_bytree': 0.25}\n",
" num_round = 2000\n",
" model = xgb.train(param, dtrain, num_round, evals=[(deval, 'eval')], early_stopping_rounds=50, verbose_eval=False)\n",
" \n",
" y_train_predict = model.predict(dtrain)\n",
" train_scores.append(r2_score(y_train, y_train_predict, multioutput='uniform_average'))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" new_predict_scores.append(model.score(p_x, p_y))"
" dtest = xgb.DMatrix(p_x, p_y)\n",
" \n",
" y_test_predict = model.predict(dtest)\n",
" predict_scores.append(r2_score(p_y, y_test_predict, multioutput='uniform_average'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 69,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0158347715471\n",
"-0.477095380466\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
......@@ -405,7 +694,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
"version": "3.6.3"
}
},
"nbformat": 4,
......
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import xgboost as xgb\n",
"import numpy as np\n",
"from alphamind.api import *\n",
"from PyFin.api import *\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"engine = SqlEngine()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"start_date = '2012-01-01'\n",
"end_date = '2018-01-05'\n",
"\n",
"features = ['roe_q',\n",
" 'ep_q',\n",
" 'DivP',\n",
" 'cfinc1_q',\n",
" 'EBIT',\n",
" 'EARNYILD',\n",
" 'EPIBS']\n",
"\n",
"freq = '10b'\n",
"batch = 16\n",
"universe = Universe('custom', ['zz500', 'hs300'])\n",
"benchmark = 905\n",
"neutralized_risk = ['SIZE'] + industry_styles\n",
"horizon = map_freq(freq)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2018-01-11 15:12:44,105 - ALPHA_MIND - INFO - Starting data package fetching ...\n",
"2018-01-11 15:12:53,578 - ALPHA_MIND - INFO - factor data loading finished\n",
"2018-01-11 15:13:03,880 - ALPHA_MIND - INFO - return data loading finished\n",
"2018-01-11 15:13:05,384 - ALPHA_MIND - INFO - industry data loading finished\n",
"2018-01-11 15:13:06,178 - ALPHA_MIND - INFO - benchmark data loading finished\n",
"2018-01-11 15:13:17,845 - ALPHA_MIND - INFO - risk data loading finished\n",
"2018-01-11 15:13:21,266 - ALPHA_MIND - INFO - data merging finished\n",
"2018-01-11 15:13:23,371 - ALPHA_MIND - INFO - Loading data is finished\n",
"2018-01-11 15:13:33,174 - ALPHA_MIND - INFO - Data processing is finished\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 49.1 s\n"
]
}
],
"source": [
"%%time\n",
"factor_data = fetch_data_package(engine,\n",
" features,\n",
" start_date,\n",
" end_date,\n",
" '5b',\n",
" universe,\n",
" benchmark,\n",
" batch=batch,\n",
" warm_start=batch,\n",
" neutralized_risk=neutralized_risk, \n",
" pre_process=[winsorize_normal, standardize],\n",
" post_process=[winsorize_normal, standardize])"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"features = factor_data['x_names']\n",
"\n",
"train_x = factor_data['train']['x']\n",
"train_y = factor_data['train']['y']\n",
"train_risk = factor_data['train']['risk']\n",
"ref_dates = sorted(train_x.keys())\n",
"\n",
"predict_x = factor_data['predict']['x']\n",
"predict_y = factor_data['predict']['y']\n",
"predict_risk = factor_data['predict']['risk']\n",
"settlement = factor_data['settlement']"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"for key, val in train_y.items():\n",
" train_y[key] = np.where(val > 0., 1, 0)\n",
" \n",
"for key, val in predict_y.items():\n",
" predict_y[key] = np.where(val > 0., 1, 0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Logistic Regression\n",
"--------------"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 5.34 s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" \n",
" model = LogisticRegression(fit_intercept=False, features=features)\n",
" model.fit(x, y)\n",
" train_scores.append(model.score(x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" predict_scores.append(model.score(p_x, p_y))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.541013986745\n",
"0.51932344036\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Random Forest Classifier\n",
"-----------"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 15min 34s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" \n",
" model = RandomForestClassifier(n_estimators=1000, max_features='sqrt', max_depth=3, n_jobs=-1)\n",
" model.fit(x, y)\n",
" train_scores.append(model.score(x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" predict_scores.append(model.score(p_x, p_y))"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.557563825608\n",
"0.553974775005\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## XGBoost Classifier\n",
"---------"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 13min 40s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" \n",
" model = XGBClassifier(n_estimators=1000,\n",
" learning_rate=0.02,\n",
" max_depth=3,\n",
" n_jobs=-1,\n",
" subsample=0.25,\n",
" colsample_bytree=0.5)\n",
" model.fit(x, y)\n",
" train_scores.append(model.score(x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" predict_scores.append(model.score(p_x, p_y))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.642946015759\n",
"0.537550683184\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Native XGBoost Classifier\n",
"---------------"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 1min 6s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" \n",
" x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.33, random_state=42)\n",
" \n",
" dtrain = xgb.DMatrix(x_train, y_train)\n",
" deval = xgb.DMatrix(x_eval, y_eval)\n",
" param = {'silent': 1,\n",
" 'objective': 'binary:logistic',\n",
" 'max_depth': 3,\n",
" 'eta': 0.01,\n",
" 'boost': 'dart',\n",
" 'tree_method': 'hist',\n",
" 'subsample': 0.25,\n",
" 'colsample_bytree': 0.5}\n",
" num_round = 2000\n",
" model = xgb.train(param, dtrain, num_round, evals=[(deval, 'eval')], early_stopping_rounds=50, verbose_eval=False)\n",
" \n",
" y_train_predict = model.predict(dtrain)\n",
" label = dtrain.get_label()\n",
" train_score = np.sum((y_train_predict > 0.5) == label) / float(len(label))\n",
"\n",
" train_scores.append(train_score)\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" dtest = xgb.DMatrix(p_x, p_y)\n",
" \n",
" y_test_predict = model.predict(dtest)\n",
" p_label = dtest.get_label()\n",
" test_score = np.sum((y_test_predict > 0.5) == p_label) / float(len(p_label))\n",
" predict_scores.append(test_score)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.567225761699\n",
"0.550997907465\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Logistic Regression with More Features\n",
"-----------------"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def cross_product(x, y):\n",
" n, m = x.shape\n",
" res = []\n",
" \n",
" for j in range(m):\n",
" res.append(x[:, [j]] * y)\n",
" \n",
" return np.concatenate(res, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 36.1 s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" risk = train_risk[date][:, 1:]\n",
" new_x = cross_product(x, risk)\n",
" \n",
" model = LogisticRegression(fit_intercept=False, features=features)\n",
" model.fit(new_x, y)\n",
" train_scores.append(model.score(new_x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" p_risk = predict_risk[date][:, 1:]\n",
" new_p_x = cross_product(p_x, p_risk)\n",
" predict_scores.append(model.score(new_p_x, p_y))"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.568125478425\n",
"0.517523115163\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Random Forest Classifier with More Features\n",
"-----------"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 14min 40s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" risk = train_risk[date][:, 1:]\n",
" new_x = cross_product(x, risk)\n",
" \n",
" model = RandomForestClassifier(n_estimators=1000, max_features='sqrt', max_depth=3, n_jobs=-1)\n",
" model.fit(new_x, y)\n",
" train_scores.append(model.score(new_x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" p_risk = predict_risk[date][:, 1:]\n",
" new_p_x = cross_product(p_x, p_risk)\n",
" predict_scores.append(model.score(new_p_x, p_y))"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.549090142483\n",
"0.559944504146\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## XGBoost Classifier with More Features\n",
"---------"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 12min 25s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" risk = train_risk[date][:, 1:]\n",
" new_x = cross_product(x, risk)\n",
" \n",
" model = XGBClassifier(n_estimators=500,\n",
" learning_rate=0.02,\n",
" max_depth=3,\n",
" n_jobs=-1,\n",
" subsample=0.25,\n",
" colsample_bytree=0.1)\n",
" model.fit(new_x, y)\n",
" train_scores.append(model.score(new_x, y))\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" p_risk = predict_risk[date][:, 1:]\n",
" new_p_x = cross_product(p_x, p_risk)\n",
" predict_scores.append(model.score(new_p_x, p_y))"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.59375573895\n",
"0.55230987889\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Native XGBoost Classifier with More Features\n",
"---------------"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2012-02-15 00:00:00\n",
"2012-06-06 00:00:00\n",
"2012-09-20 00:00:00\n",
"2013-01-15 00:00:00\n",
"2013-05-14 00:00:00\n",
"2013-08-30 00:00:00\n",
"2013-12-24 00:00:00\n",
"2014-04-17 00:00:00\n",
"2014-08-05 00:00:00\n",
"2014-11-26 00:00:00\n",
"2015-03-20 00:00:00\n",
"2015-07-08 00:00:00\n",
"2015-10-30 00:00:00\n",
"2016-02-22 00:00:00\n",
"2016-06-08 00:00:00\n",
"2016-09-27 00:00:00\n",
"2017-01-18 00:00:00\n",
"2017-05-15 00:00:00\n",
"2017-08-30 00:00:00\n",
"2017-12-20 00:00:00\n",
"Wall time: 5min 23s\n"
]
}
],
"source": [
"%%time\n",
"train_dates = list(train_x.keys())\n",
"train_scores = []\n",
"predict_scores = []\n",
"\n",
"for i, date in enumerate(train_dates):\n",
" if i % 15 == 0:\n",
" print(date)\n",
" x = train_x[date]\n",
" y = train_y[date]\n",
" risk = train_risk[date][:, 1:]\n",
" new_x = cross_product(x, risk)\n",
" \n",
" x_train, x_eval, y_train, y_eval = train_test_split(new_x, y, test_size=0.33, random_state=42)\n",
" \n",
" dtrain = xgb.DMatrix(x_train, y_train)\n",
" deval = xgb.DMatrix(x_eval, y_eval)\n",
" param = {'silent': 1,\n",
" 'objective': 'binary:logistic',\n",
" 'max_depth': 3,\n",
" 'eta': 0.01,\n",
" 'booster': 'dart',\n",
" 'tree_method': 'hist',\n",
" 'subsample': 0.25,\n",
" 'colsample_bytree': 0.5}\n",
" num_round = 2000\n",
" model = xgb.train(param, dtrain, num_round, evals=[(deval, 'eval')], early_stopping_rounds=50, verbose_eval=False)\n",
" \n",
" y_train_predict = model.predict(dtrain)\n",
" label = dtrain.get_label()\n",
" train_score = np.sum((y_train_predict > 0.5) == label) / float(len(label))\n",
"\n",
" train_scores.append(train_score)\n",
" \n",
" p_x = predict_x[date]\n",
" p_y = predict_y[date]\n",
" p_risk = predict_risk[date][:, 1:]\n",
" new_p_x = cross_product(p_x, p_risk)\n",
" dtest = xgb.DMatrix(new_p_x, p_y)\n",
" \n",
" y_test_predict = model.predict(dtest)\n",
" p_label = dtest.get_label()\n",
" test_score = np.sum((y_test_predict > 0.5) == p_label) / float(len(p_label))\n",
" predict_scores.append(test_score)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.560057712549\n",
"0.552663472836\n"
]
}
],
"source": [
"print(np.mean(train_scores))\n",
"print(np.mean(predict_scores))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Subproject commit bf4367184164e593cd2856ef38f8dd4f8cc76999
Subproject commit a187ed6c8f3aa40b47d5be80667cbbe6a6fd563d