Commit d99cf0c5 authored by Yucheng

Merge branch 'master' of https://github.com/lion-sing/alpha-mind

parents bfc257f9 7924db6d
......@@ -33,6 +33,7 @@ from alphamind.model import RandomForestRegressor
from alphamind.model import RandomForestClassifier
from alphamind.model import XGBRegressor
from alphamind.model import XGBClassifier
from alphamind.model import XGBTrainer
from alphamind.model import load_model
from alphamind.model.data_preparing import fetch_data_package
from alphamind.model.data_preparing import fetch_train_phase
......@@ -74,6 +75,7 @@ __all__ = [
'RandomForestClassifier',
'XGBRegressor',
'XGBClassifier',
'XGBTrainer',
'load_model',
'NaiveExecutor',
'ThresholdExecutor',
......
......@@ -1954,5 +1954,5 @@ class OutrightTmp(Base):
if __name__ == '__main__':
from sqlalchemy import create_engine
engine = create_engine('postgres+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
engine = create_engine('postgres+psycopg2://postgres:we083826@101.132.104.118/alpha')
Base.metadata.create_all(engine)
......@@ -388,11 +388,13 @@ class SqlEngine(object):
df = pd.read_sql(query, self.engine)
if universe.is_filtered:
codes = universe.query(self, start_date, end_date, dates)
df = pd.merge(df, codes, how='inner', on=['trade_date', 'code']).sort_values(['trade_date', 'code'])
df = pd.merge(df, codes, how='inner', on=['trade_date', 'code'])
if external_data is not None:
df = pd.merge(df, external_data, on=['trade_date', 'code']).dropna()
df.sort_values(['trade_date', 'code'], inplace=True)
df.set_index('trade_date', inplace=True)
res = transformer.transform('code', df)
......
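The hunk above defers sort_values until after the optional external_data merge, so the frame is sorted exactly once, at the end, whichever branches ran. A minimal sketch of the merge-then-sort pattern, with fabricated toy frames:

import pandas as pd

# Toy stand-ins for the factor frame and the external overlay; values are fabricated.
df = pd.DataFrame({'trade_date': ['2018-01-02', '2018-01-01'],
                   'code': [2, 1],
                   'factor': [0.5, 0.3]})
external_data = pd.DataFrame({'trade_date': ['2018-01-01', '2018-01-02'],
                              'code': [1, 2],
                              'extra': [1.0, 2.0]})

# Merges can reshuffle rows, so sort once after all of them.
df = pd.merge(df, external_data, on=['trade_date', 'code']).dropna()
df.sort_values(['trade_date', 'code'], inplace=True)
df.set_index('trade_date', inplace=True)
print(df)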
......@@ -17,25 +17,24 @@ import datetime as dt
start = dt.datetime.now()
universe_name = 'zz500'
universe = Universe('custom', ['zz800'])
factor_name = 'PE'
expression = 1. / LAST(factor_name)
simple_expression = CSRes(LAST('OperCashInToAsset'), 'roe_q')
alpha_factor_name = '1/PE'
alpha_factor = {alpha_factor_name: expression}
alpha_factor_name = 'alpha_factor'
alpha_factor = {alpha_factor_name: simple_expression}
# end of formula definition
engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
universe = Universe('custom', [universe_name])
neutralize_risk = ['SIZE'] + industry_styles
freq = '5b'
neutralize_risk = ['SIZE', 'LEVERAGE'] + industry_styles
freq = '10b'
n_bins = 5
horizon = map_freq(freq)
start_date = '2012-01-01'
end_date = '2017-11-21'
end_date = '2018-01-05'
dates = makeSchedule(start_date,
end_date,
......@@ -93,10 +92,9 @@ df = df.cumsum().plot(ax=axes[0], title='Quantile Analysis for {0}'.format(alpha
# =================================================================== #
factor_name = 'PE'
expression = DIFF(1./LAST(factor_name))
alpha_factor_name = '1/PE_1w_diff'
alpha_factor = {alpha_factor_name: expression}
alpha_factor_name = alpha_factor_name + '_1w_diff'
alpha_factor = {alpha_factor_name: DIFF(simple_expression)}
dates = makeSchedule(start_date,
end_date,
......
......@@ -14,6 +14,7 @@ from alphamind.model.treemodel import RandomForestRegressor
from alphamind.model.treemodel import RandomForestClassifier
from alphamind.model.treemodel import XGBRegressor
from alphamind.model.treemodel import XGBClassifier
from alphamind.model.treemodel import XGBTrainer
from alphamind.model.loader import load_model
......@@ -26,4 +27,5 @@ __all__ = ['LinearRegression',
'RandomForestClassifier',
'XGBRegressor',
'XGBClassifier',
'XGBTrainer',
'load_model']
\ No newline at end of file
......@@ -27,6 +27,7 @@ from alphamind.utilities import map_freq
def _merge_df(engine, names, factor_df, return_df, universe, dates, risk_model, neutralized_risk):
risk_df = engine.fetch_risk_model_range(universe, dates=dates, risk_model=risk_model)[1]
alpha_logger.info("risk data loading finished")
used_neutralized_risk = list(set(total_risk_factors).difference(names))
risk_df = risk_df[['trade_date', 'code'] + used_neutralized_risk].dropna()
return_df = pd.merge(return_df, risk_df, on=['trade_date', 'code'])
......@@ -45,9 +46,10 @@ def _merge_df(engine, names, factor_df, return_df, universe, dates, risk_model,
x_values = train_x[names].values.astype(float)
y_values = train_y[['dx']].values
codes = train_x['code'].values
date_label = pd.DatetimeIndex(factor_df.trade_date).to_pydatetime()
dates = np.unique(date_label)
return return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y
return return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y, codes
def prepare_data(engine: SqlEngine,
......@@ -82,9 +84,13 @@ def prepare_data(engine: SqlEngine,
factor_df = engine.fetch_factor_range(universe,
factors=transformer,
dates=dates).sort_values(['trade_date', 'code'])
alpha_logger.info("factor data loading finished")
return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
alpha_logger.info("return data loading finished")
industry_df = engine.fetch_industry_range(universe, dates=dates)
alpha_logger.info("industry data loading finished")
benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
alpha_logger.info("benchmark data loading finished")
df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()
df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
......@@ -102,13 +108,15 @@ def batch_processing(x_values,
batch,
risk_exp,
pre_process,
post_process):
post_process,
codes):
train_x_buckets = {}
train_y_buckets = {}
train_risk_buckets = {}
predict_x_buckets = {}
predict_y_buckets = {}
predict_risk_buckets = {}
predict_codes_bucket = {}
for i, start in enumerate(groups[:-batch]):
end = groups[i + batch]
......@@ -141,6 +149,7 @@ def batch_processing(x_values,
sub_dates = group_label[left_index:right_index]
this_raw_x = x_values[left_index:right_index]
this_codes = codes[left_index:right_index]
if risk_exp is not None:
this_risk_exp = risk_exp[left_index:right_index]
......@@ -156,6 +165,7 @@ def batch_processing(x_values,
inner_right_index = bisect.bisect_right(sub_dates, end)
predict_x_buckets[end] = ne_x[inner_left_index:inner_right_index]
predict_risk_buckets[end] = this_risk_exp[inner_left_index:inner_right_index]
predict_codes_bucket[end] = this_codes[inner_left_index:inner_right_index]
this_raw_y = y_values[left_index:right_index]
if len(this_raw_y) > 0:
......@@ -165,7 +175,13 @@ def batch_processing(x_values,
post_process=post_process)
predict_y_buckets[end] = ne_y[inner_left_index:inner_right_index]
return train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets
return train_x_buckets, \
train_y_buckets, \
train_risk_buckets, \
predict_x_buckets, \
predict_y_buckets, \
predict_risk_buckets, \
predict_codes_bucket
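Each training window above is cut with bisect over the sorted per-row date labels produced by _merge_df. A self-contained sketch of that slicing, with fabricated dates and a batch of 2:

import bisect
import datetime as dt
import numpy as np

# Sorted per-row date labels, two rows per date (fabricated).
date_label = np.array([dt.datetime(2018, 1, d) for d in (1, 1, 2, 2, 3, 3)])
groups = np.unique(date_label)

batch = 2
for i, start in enumerate(groups[:-batch]):
    end = groups[i + batch]
    left_index = bisect.bisect_left(date_label, start)
    right_index = bisect.bisect_right(date_label, end)
    # All rows whose date falls in [start, end]; the last date in the
    # window is the prediction slice, the rest is training data.
    print(start, end, right_index - left_index)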
def fetch_data_package(engine: SqlEngine,
......@@ -193,9 +209,11 @@ def fetch_data_package(engine: SqlEngine,
benchmark,
warm_start)
return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y = \
return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y, codes = \
_merge_df(engine, transformer.names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)
alpha_logger.info("data merging finished")
return_df['weight'] = train_x['weight']
return_df['industry'] = train_x['industry']
return_df['industry_code'] = train_x['industry_code']
......@@ -207,15 +225,16 @@ def fetch_data_package(engine: SqlEngine,
alpha_logger.info("Loading data is finished")
train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets = batch_processing(
x_values,
y_values,
dates,
date_label,
batch,
risk_exp,
pre_process,
post_process)
train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets, predict_codes_bucket \
= batch_processing(x_values,
y_values,
dates,
date_label,
batch,
risk_exp,
pre_process,
post_process,
codes)
alpha_logger.info("Data processing is finished")
......@@ -223,7 +242,7 @@ def fetch_data_package(engine: SqlEngine,
ret['x_names'] = transformer.names
ret['settlement'] = return_df
ret['train'] = {'x': train_x_buckets, 'y': train_y_buckets, 'risk': train_risk_buckets}
ret['predict'] = {'x': predict_x_buckets, 'y': predict_y_buckets, 'risk': predict_risk_buckets}
ret['predict'] = {'x': predict_x_buckets, 'y': predict_y_buckets, 'risk': predict_risk_buckets, 'code': predict_codes_bucket}
return ret
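With the new code bucket, a consumer can attach security codes to each prediction slice. A hedged sketch over a fabricated package dict (a toy row sum stands in for a real model):

import numpy as np
import pandas as pd

# Fabricated stand-in for the dict built above: one prediction date,
# three securities, two factors.
package = {'predict': {'x': {'2018-01-08': np.random.randn(3, 2)},
                       'code': {'2018-01-08': np.array([1, 2, 3])}}}

for date, x in package['predict']['x'].items():
    codes = package['predict']['code'][date]
    preds = pd.Series(x.sum(axis=1), index=codes, name=date)
    print(preds)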
......@@ -261,7 +280,7 @@ def fetch_train_phase(engine,
return_df, factor_df = df[['trade_date', 'code', 'dx']], df[
['trade_date', 'code', 'isOpen'] + transformer.names]
return_df, dates, date_label, risk_exp, x_values, y_values, _, _ = \
return_df, dates, date_label, risk_exp, x_values, y_values, _, _, codes = \
_merge_df(engine, transformer.names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)
if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
......@@ -274,6 +293,7 @@ def fetch_train_phase(engine,
index = (date_label >= start) & (date_label <= end)
this_raw_x = x_values[index]
this_raw_y = y_values[index]
this_code = codes[index]
if risk_exp is not None:
this_risk_exp = risk_exp[index]
else:
......@@ -291,7 +311,7 @@ def fetch_train_phase(engine,
ret = dict()
ret['x_names'] = transformer.names
ret['train'] = {'x': ne_x, 'y': ne_y}
ret['train'] = {'x': ne_x, 'y': ne_y, 'code': this_code}
return ret
......@@ -342,7 +362,6 @@ def fetch_predict_phase(engine,
end = dates[-1]
start = dates[-batch]
# index = (date_label >= start) & (date_label <= end)
left_index = bisect.bisect_left(date_label, start)
right_index = bisect.bisect_right(date_label, end)
this_raw_x = x_values[left_index:right_index]
......@@ -380,26 +399,12 @@ def fetch_predict_phase(engine,
if __name__ == '__main__':
engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
universe = Universe('zz500', ['ashare_ex'])
universe = Universe('zz500', ['hs300', 'zz500'])
neutralized_risk = ['SIZE']
res = fetch_train_phase(engine,
['EPS', 'CFinc1'],
'2017-09-04',
'2w',
res = fetch_predict_phase(engine, ['ep_q'],
'2018-01-08',
'5b',
universe,
4,
warm_start=1,
16,
neutralized_risk=neutralized_risk)
print(res)
res = fetch_predict_phase(engine,
['EPS', 'CFinc1'],
'2017-09-04',
'2w',
universe,
4,
warm_start=1,
neutralized_risk=neutralized_risk)
print(res)
......@@ -14,6 +14,7 @@ from alphamind.model.treemodel import RandomForestRegressor
from alphamind.model.treemodel import RandomForestClassifier
from alphamind.model.treemodel import XGBRegressor
from alphamind.model.treemodel import XGBClassifier
from alphamind.model.treemodel import XGBTrainer
def load_model(model_desc: dict) -> ModelBase:
......@@ -37,5 +38,7 @@ def load_model(model_desc: dict) -> ModelBase:
return XGBRegressor.load(model_desc)
elif 'XGBClassifier' in model_name_parts:
return XGBClassifier.load(model_desc)
elif 'XGBTrainer' in model_name_parts:
return XGBTrainer.load(model_desc)
else:
raise ValueError('{0} is not currently supported in model loader.'.format(model_name))
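The new branch closes the save/load round trip for XGBTrainer, mirroring the persistence tests later in this diff (random toy data):

import numpy as np
from alphamind.model.treemodel import XGBTrainer
from alphamind.model.loader import load_model

x = np.random.randn(1000, 10)
y = np.where(np.random.randn(1000) > 0, 1, 0)

model = XGBTrainer(features=list(range(10)), objective='binary:logistic')
model.fit(x, y)

desc = model.save()           # a plain dict, carrying the xgboost version
new_model = load_model(desc)  # dispatches to the new XGBTrainer branch
np.testing.assert_array_almost_equal(model.predict(x), new_model.predict(x))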
......@@ -21,7 +21,7 @@ class ModelBase(metaclass=abc.ABCMeta):
self.impl = None
self.trained_time = None
def fit(self, x, y):
def fit(self, x: np.ndarray, y: np.ndarray):
self.impl.fit(x, y.flatten())
self.trained_time = arrow.now().format("YYYY-MM-DD HH:mm:ss")
......
......@@ -7,9 +7,13 @@ Created on 2017-12-4
from typing import List
from distutils.version import LooseVersion
import arrow
import numpy as np
from sklearn import __version__ as sklearn_version
from sklearn.ensemble import RandomForestRegressor as RandomForestRegressorImpl
from sklearn.ensemble import RandomForestClassifier as RandomForestClassifierImpl
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import __version__ as xgboost_version
from xgboost import XGBRegressor as XGBRegressorImpl
from xgboost import XGBClassifier as XGBClassifierImpl
......@@ -155,4 +159,91 @@ class XGBClassifier(ModelBase):
return self.impl.feature_importances_.tolist()
class XGBTrainer(ModelBase):
def __init__(self,
objective='binary:logistic',
booster='gbtree',
tree_method='hist',
n_estimators: int=100,
learning_rate: float=0.1,
max_depth=3,
eval_sample=None,
early_stopping_rounds=None,
subsample=1.,
colsample_bytree=1.,
features: List = None,
random_state=0,
**kwargs):
super().__init__(features)
self.params = {
'silent': 1,
'objective': objective,
'max_depth': max_depth,
'eta': learning_rate,
'booster': booster,
'tree_method': tree_method,
'subsample': subsample,
'colsample_bytree': colsample_bytree,
'seed': random_state
}
self.eval_sample = eval_sample
self.num_boost_round = n_estimators
self.early_stopping_rounds = early_stopping_rounds
self.impl = None
self.kwargs = kwargs
def fit(self, x, y):
if self.eval_sample:
x_train, x_eval, y_train, y_eval = train_test_split(x,
y,
test_size=self.eval_sample,
random_state=42)
d_train = xgb.DMatrix(x_train, y_train)
d_eval = xgb.DMatrix(x_eval, y_eval)
self.impl = xgb.train(params=self.params,
dtrain=d_train,
num_boost_round=self.num_boost_round,
evals=[(d_eval, 'eval')],
early_stopping_rounds=self.early_stopping_rounds,
verbose_eval=False,
**self.kwargs)
else:
d_train = xgb.DMatrix(x, y)
self.impl = xgb.train(params=self.params,
dtrain=d_train,
num_boost_round=self.num_boost_round,
**self.kwargs)
self.trained_time = arrow.now().format("YYYY-MM-DD HH:mm:ss")
def predict(self, x: np.ndarray) -> np.ndarray:
d_predict = xgb.DMatrix(x)
return self.impl.predict(d_predict)
def save(self) -> dict:
model_desc = super().save()
model_desc['xgboost_version'] = xgboost_version
model_desc['importances'] = self.importances
return model_desc
@classmethod
def load(cls, model_desc: dict):
obj_layout = super().load(model_desc)
if LooseVersion(xgboost_version) < LooseVersion(model_desc['xgboost_version']):
alpha_logger.warning('Current xgboost version {0} is lower than the model version {1}. '
'The loaded model may not work correctly.'.format(xgboost_version,
model_desc['xgboost_version']))
return obj_layout
@property
def importances(self):
imps = self.impl.get_fscore().items()
# get_fscore keys look like 'f0', 'f1', ...; sort by feature index, not lexicographically
imps = sorted(imps, key=lambda x: int(x[0][1:]))
return list(zip(*imps))[1]
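A usage sketch of the evaluation split on toy data: eval_sample holds out part of the rows as an xgboost watch list, which is what lets early stopping end boosting before n_estimators rounds (assumes the early_stopping_rounds forwarding added in fit above):

import numpy as np
from alphamind.model.treemodel import XGBTrainer

x = np.random.randn(500, 10)
y = np.where(np.random.randn(500) > 0, 1, 0)

model = XGBTrainer(features=list(range(10)),
                   objective='binary:logistic',
                   n_estimators=500,
                   eval_sample=0.2,           # 20% of rows go to the eval set
                   early_stopping_rounds=10)  # stop when eval loss stalls
model.fit(x, y)
probabilities = model.predict(x)  # binary:logistic yields probabilities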
......@@ -12,16 +12,18 @@ from alphamind.model.treemodel import RandomForestRegressor
from alphamind.model.treemodel import RandomForestClassifier
from alphamind.model.treemodel import XGBRegressor
from alphamind.model.treemodel import XGBClassifier
from alphamind.model.treemodel import XGBTrainer
class TestTreeModel(unittest.TestCase):
def setUp(self):
self.x = np.random.randn(1000, 10)
self.y = np.random.randn(1000)
def test_random_forest_regress_persistence(self):
model = RandomForestRegressor(features=list(range(10)))
x = np.random.randn(1000, 10)
y = np.random.randn(1000)
model.fit(x, y)
model.fit(self.x, self.y)
desc = model.save()
new_model = load_model(desc)
......@@ -29,14 +31,12 @@ class TestTreeModel(unittest.TestCase):
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_random_forest_classify_persistence(self):
model = RandomForestClassifier(features=list(range(10)))
x = np.random.randn(1000, 10)
y = np.random.randn(1000)
y = np.where(y > 0, 1, 0)
model.fit(x, y)
y = np.where(self.y > 0, 1, 0)
model.fit(self.x, y)
desc = model.save()
new_model = load_model(desc)
......@@ -44,13 +44,11 @@ class TestTreeModel(unittest.TestCase):
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_xgb_regress_persistence(self):
model = XGBRegressor(features=list(range(10)))
x = np.random.randn(1000, 10)
y = np.random.randn(1000)
model.fit(x, y)
model.fit(self.x, self.y)
desc = model.save()
new_model = load_model(desc)
......@@ -58,14 +56,56 @@ class TestTreeModel(unittest.TestCase):
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_xgb_classify_persistence(self):
model = XGBClassifier(features=list(range(10)))
x = np.random.randn(1000, 10)
y = np.random.randn(1000)
y = np.where(y > 0, 1, 0)
y = np.where(self.y > 0, 1, 0)
model.fit(x, y)
model.fit(self.x, y)
desc = model.save()
new_model = load_model(desc)
self.assertEqual(model.features, new_model.features)
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_xgb_trainer_equal_classifier(self):
sample_x = np.random.randn(100, 10)
model1 = XGBClassifier(n_estimators=100,
learning_rate=0.1,
max_depth=3,
features=list(range(10)),
random_state=42)
model2 = XGBTrainer(features=list(range(10)),
objective='reg:logistic',
booster='gbtree',
tree_method='exact',
n_estimators=100,
learning_rate=0.1,
max_depth=3,
random_state=42)
y = np.where(self.y > 0, 1, 0)
model1.fit(self.x, y)
model2.fit(self.x, y)
predict1 = model1.predict(sample_x)
predict2 = model2.predict(sample_x)
predict2 = np.where(predict2 > 0.5, 1., 0.)
np.testing.assert_array_almost_equal(predict1, predict2)
def test_xgb_trainer_persistence(self):
model = XGBTrainer(features=list(range(10)),
objective='binary:logistic',
booster='gbtree',
tree_method='hist',
n_estimators=200)
y = np.where(self.y > 0, 1, 0)
model.fit(self.x, y)
desc = model.save()
new_model = load_model(desc)
......@@ -73,3 +113,4 @@ class TestTreeModel(unittest.TestCase):
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
......@@ -4,7 +4,12 @@ cd xgboost
git submodule init
git submodule update
mkdir build
cd build
cmake ..
make -j4
cd ..
cd python-package
python setup.py install
......@@ -22,4 +27,4 @@ if [ $? -ne 0 ] ; then
exit 1
fi
cd ../..
\ No newline at end of file
cd ../..
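After python setup.py install, a one-line import check confirms which build ended up on the path (a sanity check, not part of the script):

# Post-install sanity check for the freshly built package.
import xgboost
print(xgboost.__version__)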
Subproject commit bf4367184164e593cd2856ef38f8dd4f8cc76999
Subproject commit a187ed6c8f3aa40b47d5be80667cbbe6a6fd563d