Merge branch 'master' of https://github.com/lion-sing/alpha-mind

b19af2d0 · Yucheng · 058357c0 · 4622cdf5 · b19af2d0 · b19af2d0
Commit b19af2d0 authored Jan 08, 2018 by Yucheng
21 changed files
--- a/.gitmodules
+++ b/.gitmodules
 [submodule "alphamind/pfopt"]
 	path = alphamind/pfopt
 	url = https://github.com/alpha-miner/portfolio-optimizer.git
+[submodule "xgboost"]
+	path = xgboost
+	url = https://github.com/dmlc/xgboost.git
--- a/README.md
+++ b/README.md
@@ -42,16 +42,16 @@ alpha - mind 提供了多因子研究中常用的工具链，包括：
  在Windows上完整安装，需要有C++编译器(例如msvc）:
-```bash
+    ```bash
-build_windows_dependencies.bat
+    build_windows_dependencies.bat
-```
+    ```
 * Linux
-在linux上，需要c++编译器（例如g++）以及fortran编译器（例如gfortran)
+    在linux上，需要c++编译器（例如g++）以及fortran编译器（例如gfortran)
-```bash
+    ```bash
-build_linux_dependencies.sh
+    build_linux_dependencies.sh
 ```
 ## 安装

--- a/alphamind/__init__.py
+++ b/alphamind/__init__.py
@@ -4,3 +4,6 @@ Created on 2017-4-25
 @author: cheng.li
 """
+__version__ = "0.1.0"
--- a/alphamind/api.py
+++ b/alphamind/api.py
@@ -25,11 +25,15 @@ from alphamind.data.standardize import projection
 from alphamind.data.neutralize import neutralize
 from alphamind.data.engines.sqlengine import factor_tables
-from alphamind.model.linearmodel import LinearRegression
+from alphamind.model import LinearRegression
-from alphamind.model.linearmodel import LassoRegression
+from alphamind.model import LassoRegression
-from alphamind.model.linearmodel import ConstLinearModel
+from alphamind.model import ConstLinearModel
-from alphamind.model.treemodel import RandomForestRegressor
+from alphamind.model import LogisticRegression
-from alphamind.model.loader import load_model
+from alphamind.model import RandomForestRegressor
+from alphamind.model import RandomForestClassifier
+from alphamind.model import XGBRegressor
+from alphamind.model import XGBClassifier
+from alphamind.model import load_model
 from alphamind.model.data_preparing import fetch_data_package
 from alphamind.model.data_preparing import fetch_train_phase
@@ -39,27 +43,7 @@ from alphamind.execution.targetvolexecutor import TargetVolExecutor
 from alphamind.execution.pipeline import ExecutionPipeline
 from alphamind.utilities import alpha_logger
+from alphamind.utilities import map_freq
-def map_freq(freq):
-    if freq == '1m':
-        horizon = 21
-    elif freq == '1w':
-        horizon = 4
-    elif freq == '2w':
-        horizon = 9
-    elif freq == '3w':
-        horizon = 14
-    elif freq == '4w':
-        horizon = 19
-    elif freq == '1d':
-        horizon = 0
-    elif freq[-1] == "b":
-        horizon = int(freq[:-1]) - 1
-    else:
-        raise ValueError("Unrecognized freq: {0}".format(freq))
-    return horizon
 __all__ = [
@@ -85,7 +69,11 @@ __all__ = [
    'LinearRegression',
    'LassoRegression',
    'ConstLinearModel',
+    'LogisticRegression',
    'RandomForestRegressor',
+    'RandomForestClassifier',
+    'XGBRegressor',
+    'XGBClassifier',
    'load_model',
    'NaiveExecutor',
    'ThresholdExecutor',

--- a/alphamind/data/dbmodel/models.py
+++ b/alphamind/data/dbmodel/models.py
@@ -664,63 +664,6 @@ class Experimental(Base):
    DROEAfterNonRecurring = Column(Float(53))
    CFinc1 = Column(Float(53))
    xueqiu_hotness = Column(Float(53))
-    con_eps = Column(Float(53))
-    con_pb = Column(Float(53))
-    con_pb_order = Column(Float(53))
-    con_pb_rolling = Column(Float(53))
-    con_pb_rolling_order = Column(Float(53))
-    con_pe = Column(Float(53))
-    con_pe_order = Column(Float(53))
-    con_pe_rolling = Column(Float(53))
-    con_pe_rolling_order = Column(Float(53))
-    con_peg = Column(Float(53))
-    con_peg_order = Column(Float(53))
-    con_peg_rolling = Column(Float(53))
-    con_peg_rolling_order = Column(Float(53))
-    con_ps = Column(Float(53))
-    con_ps_order = Column(Float(53))
-    con_ps_rolling = Column(Float(53))
-    con_ps_rolling_order = Column(Float(53))
-    con_target_price = Column(Float(53))
-    market_confidence_10d = Column(Float(53))
-    market_confidence_15d = Column(Float(53))
-    market_confidence_25d = Column(Float(53))
-    market_confidence_5d = Column(Float(53))
-    market_confidence_75d = Column(Float(53))
-    optimism_confidence_10d = Column(Float(53))
-    optimism_confidence_15d = Column(Float(53))
-    optimism_confidence_25d = Column(Float(53))
-    optimism_confidence_5d = Column(Float(53))
-    optimism_confidence_75d = Column(Float(53))
-    pessimism_confidence_10d = Column(Float(53))
-    pessimism_confidence_15d = Column(Float(53))
-    pessimism_confidence_25d = Column(Float(53))
-    pessimism_confidence_5d = Column(Float(53))
-    pessimism_confidence_75d = Column(Float(53))
-    con_na_yoy = Column(Float(53))
-    con_np_yoy = Column(Float(53))
-    con_npcgrate_13w = Column(Float(53))
-    con_npcgrate_1w = Column(Float(53))
-    con_npcgrate_26w = Column(Float(53))
-    con_npcgrate_2y = Column(Float(53))
-    con_npcgrate_4w = Column(Float(53))
-    con_npcgrate_52w = Column(Float(53))
-    con_or_yoy = Column(Float(53))
-    con_roe_yoy1 = Column(Float(53))
-    con_roe_yoy2 = Column(Float(53))
-    con_roe_yoy3 = Column(Float(53))
-    con_eps_rolling = Column(Float(53))
-    con_np = Column(Float(53))
-    con_np_rolling = Column(Float(53))
-    con_or = Column(Float(53))
-    con_or_rolling = Column(Float(53))
-    con_roe = Column(Float(53))
-    con_na = Column(Float(53))
-    con_na_rolling = Column(Float(53))
-    mcap = Column(Float(53))
-    tcap = Column(Float(53))
-    ta = Column(Float(53))
-    na = Column(Float(53))
    eps_q = Column(Float(53))
    roe_q = Column(Float(53))
    cfinc1_q = Column(Float(53))

--- a/alphamind/model/__init__.py
+++ b/alphamind/model/__init__.py
@@ -8,11 +8,22 @@ Created on 2017-5-2
 from alphamind.model.linearmodel import LinearRegression
 from alphamind.model.linearmodel import LassoRegression
 from alphamind.model.linearmodel import ConstLinearModel
+from alphamind.model.linearmodel import LogisticRegression
 from alphamind.model.treemodel import RandomForestRegressor
+from alphamind.model.treemodel import RandomForestClassifier
+from alphamind.model.treemodel import XGBRegressor
+from alphamind.model.treemodel import XGBClassifier
+from alphamind.model.loader import load_model
 __all__ = ['LinearRegression',
           'LassoRegression',
           'ConstLinearModel',
-           'RandomForestRegressor']
+           'LogisticRegression',
\ No newline at end of file
+           'RandomForestRegressor',
+           'RandomForestClassifier',
+           'XGBRegressor',
+           'XGBClassifier',
+           'load_model']
\ No newline at end of file
--- a/alphamind/model/data_preparing.py
+++ b/alphamind/model/data_preparing.py
@@ -16,27 +16,13 @@ from PyFin.api import BizDayConventions
 from PyFin.api import DateGeneration
 from PyFin.api import advanceDateByCalendar
 from PyFin.DateUtilities import Period
-from PyFin.Enums import TimeUnits
 from alphamind.data.transformer import Transformer
 from alphamind.data.engines.sqlengine import SqlEngine
 from alphamind.data.engines.universe import Universe
 from alphamind.data.processing import factor_processing
 from alphamind.data.engines.sqlengine import total_risk_factors
 from alphamind.utilities import alpha_logger
+from alphamind.utilities import map_freq
-def _map_horizon(frequency: str) -> int:
-    parsed_period = Period(frequency)
-    unit = parsed_period.units()
-    length = parsed_period.length()
-    if unit == TimeUnits.BDays or unit == TimeUnits.Days:
-        return length - 1
-    elif unit == TimeUnits.Weeks:
-        return 5 * length - 1
-    elif unit == TimeUnits.Months:
-        return 22 * length - 1
-    else:
-        raise ValueError('{0} is an unrecognized frequency rule'.format(frequency))
 def _merge_df(engine, names, factor_df, return_df, universe, dates, risk_model, neutralized_risk):
@@ -86,7 +72,7 @@ def prepare_data(engine: SqlEngine,
    dates = [d.strftime('%Y-%m-%d') for d in dates]
-    horizon = _map_horizon(frequency)
+    horizon = map_freq(frequency)
    if isinstance(factors, Transformer):
        transformer = factors
@@ -119,8 +105,10 @@ def batch_processing(x_values,
                     post_process):
    train_x_buckets = {}
    train_y_buckets = {}
+    train_risk_buckets = {}
    predict_x_buckets = {}
    predict_y_buckets = {}
+    predict_risk_buckets = {}
    for i, start in enumerate(groups[:-batch]):
        end = groups[i + batch]
@@ -146,6 +134,8 @@ def batch_processing(x_values,
                                                 risk_factors=this_risk_exp,
                                                 post_process=post_process)
+        train_risk_buckets[end] = this_risk_exp
        left_index = bisect.bisect_right(group_label, start)
        right_index = bisect.bisect_right(group_label, end)
@@ -165,6 +155,7 @@ def batch_processing(x_values,
        inner_left_index = bisect.bisect_left(sub_dates, end)
        inner_right_index = bisect.bisect_right(sub_dates, end)
        predict_x_buckets[end] = ne_x[inner_left_index:inner_right_index]
+        predict_risk_buckets[end] = this_risk_exp[inner_left_index:inner_right_index]
        this_raw_y = y_values[left_index:right_index]
        if len(this_raw_y) > 0:
@@ -174,7 +165,7 @@ def batch_processing(x_values,
                                     post_process=post_process)
            predict_y_buckets[end] = ne_y[inner_left_index:inner_right_index]
-    return train_x_buckets, train_y_buckets, predict_x_buckets, predict_y_buckets
+    return train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets
 def fetch_data_package(engine: SqlEngine,
@@ -216,7 +207,7 @@ def fetch_data_package(engine: SqlEngine,
    alpha_logger.info("Loading data is finished")
-    train_x_buckets, train_y_buckets, predict_x_buckets, predict_y_buckets = batch_processing(
+    train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets = batch_processing(
        x_values,
        y_values,
        dates,
@@ -231,8 +222,8 @@ def fetch_data_package(engine: SqlEngine,
    ret = dict()
    ret['x_names'] = transformer.names
    ret['settlement'] = return_df
-    ret['train'] = {'x': train_x_buckets, 'y': train_y_buckets}
+    ret['train'] = {'x': train_x_buckets, 'y': train_y_buckets, 'risk': train_risk_buckets}
-    ret['predict'] = {'x': predict_x_buckets, 'y': predict_y_buckets}
+    ret['predict'] = {'x': predict_x_buckets, 'y': predict_y_buckets, 'risk': predict_risk_buckets}
    return ret
@@ -260,7 +251,7 @@ def fetch_train_phase(engine,
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)
-    horizon = _map_horizon(frequency)
+    horizon = map_freq(frequency)
    factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates)
    return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
@@ -339,10 +330,10 @@ def fetch_predict_phase(engine,
        risk_df = risk_df[['trade_date', 'code'] + used_neutralized_risk].dropna()
        train_x = pd.merge(factor_df, risk_df, on=['trade_date', 'code'])
        risk_exp = train_x[neutralized_risk].values.astype(float)
-        x_values = train_x[names].values.astype(float)
    else:
        train_x = factor_df.copy()
        risk_exp = None
+    x_values = train_x[names].values.astype(float)
    date_label = pd.DatetimeIndex(factor_df.trade_date).to_pydatetime()
    dates = np.unique(date_label)

--- a/alphamind/model/linearmodel.py
+++ b/alphamind/model/linearmodel.py
@@ -6,87 +6,75 @@ Created on 2017-5-10
 """
 import numpy as np
-import arrow
 from distutils.version import LooseVersion
 from sklearn import __version__ as sklearn_version
 from sklearn.linear_model import LinearRegression as LinearRegressionImpl
 from sklearn.linear_model import Lasso
+from sklearn.linear_model import LogisticRegression as LogisticRegressionImpl
 from PyFin.api import pyFinAssert
 from alphamind.model.modelbase import ModelBase
 from alphamind.utilities import alpha_logger
-from alphamind.utilities import encode
-from alphamind.utilities import decode
+# Minimal estimator with fixed, caller-supplied coefficients. It offers the
+# fit/predict methods that ModelBase forwards to through ``self.impl``.
+class ConstLinearModelImpl(object):
+    def __init__(self, weights: np.ndarray = None):
+        # Coerce to a 1-D array. NOTE(review): with the default ``None`` this
+        # yields a one-element object array, so predict() is only meaningful
+        # once real weights are supplied -- confirm no caller relies on it.
+        self.weights = np.array(weights).flatten()
+    def fit(self, x: np.ndarray, y: np.ndarray):
+        # Nothing to learn: the weights are constants.
+        pass
+    def predict(self, x: np.ndarray):
+        # Matrix product of x with the stored weights.
+        return x @ self.weights
 class ConstLinearModel(ModelBase):
    def __init__(self,
-                 features: list=None,
+                 features: list = None,
-                 weights: np.ndarray=None):
+                 weights: np.ndarray = None):
        super().__init__(features)
        if features is not None and weights is not None:
            pyFinAssert(len(features) == len(weights),
                        ValueError,
                        "length of features is not equal to length of weights")
-            self.weights = np.array(weights).flatten()
+        self.impl = ConstLinearModelImpl(weights)
-    def fit(self, x: np.ndarray, y: np.ndarray):
-        pass
-    def predict(self, x):
-        return x @ self.weights
    def save(self):
        model_desc = super().save()
-        model_desc['weight'] = list(self.weights)
+        model_desc['weight'] = list(self.impl.weights)
        return model_desc
    @classmethod
    def load(cls, model_desc: dict):
-        obj_layout = cls()
+        return super().load(model_desc)
-        obj_layout.features = model_desc['features']
-        obj_layout.weights = np.array(model_desc['weight'])
+    @property
-        return obj_layout
+    def weights(self):
+        return self.impl.weights.tolist()
 class LinearRegression(ModelBase):
-    def __init__(self, features: list=None, fit_intercept: bool=False):
+    def __init__(self, features: list = None, fit_intercept: bool = False, **kwargs):
        super().__init__(features)
-        self.impl = LinearRegressionImpl(fit_intercept=fit_intercept)
+        self.impl = LinearRegressionImpl(fit_intercept=fit_intercept, **kwargs)
        self.trained_time = None
-    def fit(self, x: np.ndarray, y: np.ndarray):
-        self.impl.fit(x, y)
-        self.trained_time = arrow.now().format("YYYY-MM-DD HH:mm:ss")
-    def predict(self, x: np.ndarray) -> np.ndarray:
-        return self.impl.predict(x)
    def save(self) -> dict:
        model_desc = super().save()
-        model_desc['internal_model'] = self.impl.__class__.__module__ + "." + self.impl.__class__.__name__
-        model_desc['desc'] = encode(self.impl)
        model_desc['sklearn_version'] = sklearn_version
-        model_desc['trained_time'] = self.trained_time
        model_desc['weight'] = self.impl.coef_.tolist()
        return model_desc
-    def score(self) -> float:
-        return self.impl.score()
    @classmethod
    def load(cls, model_desc: dict):
-        obj_layout = cls()
+        obj_layout = super().load(model_desc)
-        obj_layout.features = model_desc['features']
-        obj_layout.trained_time = model_desc['trained_time']
        if LooseVersion(sklearn_version) < LooseVersion(model_desc['sklearn_version']):
            alpha_logger.warning('Current sklearn version {0} is lower than the model version {1}. '
-                                 'Loaded model may work incorrectly.'.format(
+                                 'Loaded model may work incorrectly.'.format(sklearn_version,
-                sklearn_version, model_desc['sklearn_version']))
+                                                                             model_desc['sklearn_version']))
-        obj_layout.impl = decode(model_desc['desc'])
        return obj_layout
    @property
@@ -96,42 +84,52 @@ class LinearRegression(ModelBase):
 class LassoRegression(ModelBase):
-    def __init__(self, alpha, features: list=None, fit_intercept: bool=False):
+    def __init__(self, alpha=0.01, features: list = None, fit_intercept: bool = False, **kwargs):
        super().__init__(features)
-        self.impl = Lasso(alpha=alpha, fit_intercept=fit_intercept)
+        self.impl = Lasso(alpha=alpha, fit_intercept=fit_intercept, **kwargs)
        self.trained_time = None
-    def fit(self, x: np.ndarray, y: np.ndarray):
+    def save(self) -> dict:
-        self.impl.fit(x, y)
+        model_desc = super().save()
-        self.trained_time = arrow.now().format("YYYY-MM-DD HH:mm:ss")
+        model_desc['sklearn_version'] = sklearn_version
+        model_desc['weight'] = self.impl.coef_.tolist()
+        return model_desc
+    @classmethod
+    def load(cls, model_desc: dict):
+        obj_layout = super().load(model_desc)
-    def predict(self, x: np.ndarray) -> np.ndarray:
+        if LooseVersion(sklearn_version) < LooseVersion(model_desc['sklearn_version']):
-        return self.impl.predict(x)
+            alpha_logger.warning('Current sklearn version {0} is lower than the model version {1}. '
+                                 'Loaded model may work incorrectly.'.format(sklearn_version,
+                                                                             model_desc['sklearn_version']))
+        return obj_layout
+    @property
+    def weights(self):
+        return self.impl.coef_.tolist()
+class LogisticRegression(ModelBase):
+    def __init__(self, features: list = None, fit_intercept: bool = False, **kwargs):
+        super().__init__(features)
+        self.impl = LogisticRegressionImpl(fit_intercept=fit_intercept, **kwargs)
    def save(self) -> dict:
        model_desc = super().save()
-        model_desc['internal_model'] = self.impl.__class__.__module__ + "." + self.impl.__class__.__name__
-        model_desc['desc'] = encode(self.impl)
        model_desc['sklearn_version'] = sklearn_version
-        model_desc['trained_time'] = self.trained_time
        model_desc['weight'] = self.impl.coef_.tolist()
        return model_desc
-    def score(self) -> float:
-        return self.impl.score()
    @classmethod
    def load(cls, model_desc: dict):
-        obj_layout = cls(alpha=0.)
+        obj_layout = super().load(model_desc)
-        obj_layout.features = model_desc['features']
-        obj_layout.trained_time = model_desc['trained_time']
        if LooseVersion(sklearn_version) < LooseVersion(model_desc['sklearn_version']):
            alpha_logger.warning('Current sklearn version {0} is lower than the model version {1}. '
-                                 'Loaded model may work incorrectly.'.format(
+                                 'Loaded model may work incorrectly.'.format(sklearn_version,
-                sklearn_version, model_desc['sklearn_version']))
+                                                                             model_desc['sklearn_version']))
-        obj_layout.impl = decode(model_desc['desc'])
        return obj_layout
    @property
@@ -140,8 +138,8 @@ class LassoRegression(ModelBase):
 if __name__ == '__main__':
    import pprint
    ls = ConstLinearModel(['a', 'b'], np.array([0.5, 0.5]))
    x = np.array([[0.2, 0.2],

--- a/alphamind/model/loader.py
+++ b/alphamind/model/loader.py
@@ -9,6 +9,11 @@ from alphamind.model.modelbase import ModelBase
 from alphamind.model.linearmodel import ConstLinearModel
 from alphamind.model.linearmodel import LinearRegression
 from alphamind.model.linearmodel import LassoRegression
+from alphamind.model.linearmodel import LogisticRegression
+from alphamind.model.treemodel import RandomForestRegressor
+from alphamind.model.treemodel import RandomForestClassifier
+from alphamind.model.treemodel import XGBRegressor
+from alphamind.model.treemodel import XGBClassifier
 def load_model(model_desc: dict) -> ModelBase:
@@ -22,5 +27,15 @@ def load_model(model_desc: dict) -> ModelBase:
        return LinearRegression.load(model_desc)
    elif 'LassoRegression' in model_name_parts:
        return LassoRegression.load(model_desc)
+    elif 'LogisticRegression' in model_name_parts:
+        return LogisticRegression.load(model_desc)
+    elif 'RandomForestRegressor' in model_name_parts:
+        return RandomForestRegressor.load(model_desc)
+    elif 'RandomForestClassifier' in model_name_parts:
+        return RandomForestClassifier.load(model_desc)
+    elif 'XGBRegressor' in model_name_parts:
+        return XGBRegressor.load(model_desc)
+    elif 'XGBClassifier' in model_name_parts:
+        return XGBClassifier.load(model_desc)
    else:
        raise ValueError('{0} is not currently supported in model loader.'.format(model_name))
--- a/alphamind/model/modelbase.py
+++ b/alphamind/model/modelbase.py
@@ -9,6 +9,8 @@ import abc
 import arrow
 import numpy as np
 from alphamind.utilities import alpha_logger
+from alphamind.utilities import encode
+from alphamind.utilities import decode
 class ModelBase(metaclass=abc.ABCMeta):
@@ -16,14 +18,18 @@ class ModelBase(metaclass=abc.ABCMeta):
    def __init__(self, features: list=None):
        if features is not None:
            self.features = list(features)
+        self.impl = None
+        self.trained_time = None
-    @abc.abstractmethod
    def fit(self, x, y):
-        pass
+        self.impl.fit(x, y.flatten())
+        self.trained_time = arrow.now().format("YYYY-MM-DD HH:mm:ss")
-    @abc.abstractmethod
+    def predict(self, x: np.ndarray) -> np.ndarray:
-    def predict(self, x) -> np.ndarray:
+        return self.impl.predict(x)
-        pass
+    def score(self, x: np.ndarray, y: np.ndarray) -> float:
+        return self.impl.score(x, y)
    @abc.abstractmethod
    def save(self) -> dict:
@@ -34,10 +40,17 @@ class ModelBase(metaclass=abc.ABCMeta):
        model_desc = dict(model_name=self.__class__.__module__ + "." + self.__class__.__name__,
                          language='python',
                          saved_time=arrow.now().format("YYYY-MM-DD HH:mm:ss"),
-                          features=list(self.features))
+                          features=list(self.features),
+                          trained_time=self.trained_time,
+                          desc=encode(self.impl),
+                          internal_model=self.impl.__class__.__module__ + "." + self.impl.__class__.__name__)
        return model_desc
    @abc.abstractclassmethod
    def load(cls, model_desc: dict):
-        pass
+        obj_layout = cls()
+        obj_layout.features = model_desc['features']
+        obj_layout.trained_time = model_desc['trained_time']
+        obj_layout.impl = decode(model_desc['desc'])
+        return obj_layout
--- a/alphamind/model/treemodel.py
+++ b/alphamind/model/treemodel.py
@@ -5,47 +5,154 @@ Created on 2017-12-4
 @author: cheng.li
 """
-import arrow
+from typing import List
-import numpy as np
 from distutils.version import LooseVersion
 from sklearn import __version__ as sklearn_version
 from sklearn.ensemble import RandomForestRegressor as RandomForestRegressorImpl
+from sklearn.ensemble import RandomForestClassifier as RandomForestClassifierImpl
+from xgboost import __version__ as xgbboot_version
+from xgboost import XGBRegressor as XGBRegressorImpl
+from xgboost import XGBClassifier as XGBClassifierImpl
 from alphamind.model.modelbase import ModelBase
 from alphamind.utilities import alpha_logger
-from alphamind.utilities import encode
-from alphamind.utilities import decode
 class RandomForestRegressor(ModelBase):
-    def __init__(self, n_estimators, features=None, *args, **kwargs):
+    def __init__(self,
+                 n_estimators: int=100,
+                 max_features: str='auto',
+                 features: List=None,
+                 **kwargs):
        super().__init__(features)
-        self.impl = RandomForestRegressorImpl(n_estimators, *args, **kwargs)
+        self.impl = RandomForestRegressorImpl(n_estimators=n_estimators,
+                                              max_features=max_features,
+                                              **kwargs)
+        self.trained_time = None
-    def fit(self, x: np.ndarray, y: np.ndarray):
+    def save(self) -> dict:
-        self.impl.fit(x, y)
+        model_desc = super().save()
-        self.trained_time = arrow.now().format("YYYY-MM-DD HH:mm:ss")
+        model_desc['sklearn_version'] = sklearn_version
+        model_desc['importances'] = self.importances
+        return model_desc
+    @classmethod
+    def load(cls, model_desc: dict):
+        obj_layout = super().load(model_desc)
+        if LooseVersion(sklearn_version) < LooseVersion(model_desc['sklearn_version']):
+            alpha_logger.warning('Current sklearn version {0} is lower than the model version {1}. '
+                                 'Loaded model may work incorrectly.'.format(sklearn_version,
+                                                                             model_desc['sklearn_version']))
+        return obj_layout
+    @property
+    def importances(self):
+        return self.impl.feature_importances_.tolist()
-    def predict(self, x: np.ndarray) -> np.ndarray:
-        return self.impl.predict(x)
+class RandomForestClassifier(ModelBase):
+    def __init__(self,
+                 n_estimators: int=100,
+                 max_features: str='auto',
+                 features: List = None,
+                 **kwargs):
+        super().__init__(features)
+        self.impl = RandomForestClassifierImpl(n_estimators=n_estimators,
+                                               max_features=max_features,
+                                               **kwargs)
+        self.trained_time = None
    def save(self) -> dict:
        model_desc = super().save()
-        model_desc['internal_model'] = self.impl.__class__.__module__ + "." + self.impl.__class__.__name__
-        model_desc['desc'] = encode(self.impl)
        model_desc['sklearn_version'] = sklearn_version
-        model_desc['trained_time'] = self.trained_time
+        model_desc['importances'] = self.importances
+        return model_desc
    @classmethod
    def load(cls, model_desc: dict):
-        obj_layout = cls()
+        obj_layout = super().load(model_desc)
-        obj_layout.features = model_desc['features']
-        obj_layout.trained_time = model_desc['trained_time']
        if LooseVersion(sklearn_version) < LooseVersion(model_desc['sklearn_version']):
            alpha_logger.warning('Current sklearn version {0} is lower than the model version {1}. '
-                                 'Loaded model may work incorrectly.'.format(
+                                 'Loaded model may work incorrectly.'.format(sklearn_version,
-                sklearn_version, model_desc['sklearn_version']))
+                                                                             model_desc['sklearn_version']))
+        return obj_layout
+    @property
+    def importances(self):
+        return self.impl.feature_importances_.tolist()
-        obj_layout.impl = decode(model_desc['desc'])
+class XGBRegressor(ModelBase):
+    """ModelBase wrapper around ``xgboost.XGBRegressor``.
+
+    The wrapped estimator is stored in ``self.impl``; extra keyword
+    arguments are forwarded to it unchanged.
+    """
+    def __init__(self,
+                 n_estimators: int=100,
+                 learning_rate: float=0.1,
+                 max_depth: int=3,
+                 features: List=None,
+                 **kwargs):
+        super().__init__(features)
+        self.impl = XGBRegressorImpl(n_estimators=n_estimators,
+                                     learning_rate=learning_rate,
+                                     max_depth=max_depth,
+                                     **kwargs)
+    def save(self) -> dict:
+        """Return the base description plus xgboost version and importances."""
+        model_desc = super().save()
+        model_desc['xgbboot_version'] = xgbboot_version
+        model_desc['importances'] = self.importances
+        return model_desc
+    @classmethod
+    def load(cls, model_desc: dict):
+        """Rebuild the model from ``save()`` output.
+
+        Logs a warning when the running xgboost is older than the version
+        recorded in ``model_desc``.
+        """
+        obj_layout = super().load(model_desc)
+        # The saved key holds the xgboost version, so it must be compared
+        # with the running xgboost version (not sklearn's).
+        if LooseVersion(xgbboot_version) < LooseVersion(model_desc['xgbboot_version']):
+            alpha_logger.warning('Current xgboost version {0} is lower than the model version {1}. '
+                                 'Loaded model may work incorrectly.'.format(xgbboot_version,
+                                                                             model_desc['xgbboot_version']))
        return obj_layout
+    @property
+    def importances(self):
+        """Feature importances of the wrapped estimator as a plain list."""
+        return self.impl.feature_importances_.tolist()
+class XGBClassifier(ModelBase):
+    """ModelBase wrapper around ``xgboost.XGBClassifier``.
+
+    The wrapped estimator is stored in ``self.impl``; extra keyword
+    arguments are forwarded to it unchanged.
+    """
+    def __init__(self,
+                 n_estimators: int=100,
+                 learning_rate: float=0.1,
+                 max_depth: int=3,
+                 features: List = None,
+                 **kwargs):
+        super().__init__(features)
+        self.impl = XGBClassifierImpl(n_estimators=n_estimators,
+                                      learning_rate=learning_rate,
+                                      max_depth=max_depth,
+                                      **kwargs)
+    def save(self) -> dict:
+        """Return the base description plus xgboost version and importances."""
+        model_desc = super().save()
+        model_desc['xgbboot_version'] = xgbboot_version
+        model_desc['importances'] = self.importances
+        return model_desc
+    @classmethod
+    def load(cls, model_desc: dict):
+        """Rebuild the model from ``save()`` output.
+
+        Logs a warning when the running xgboost is older than the version
+        recorded in ``model_desc``.
+        """
+        obj_layout = super().load(model_desc)
+        # The saved key holds the xgboost version, so it must be compared
+        # with the running xgboost version (not sklearn's).
+        if LooseVersion(xgbboot_version) < LooseVersion(model_desc['xgbboot_version']):
+            alpha_logger.warning('Current xgboost version {0} is lower than the model version {1}. '
+                                 'Loaded model may work incorrectly.'.format(xgbboot_version,
+                                                                             model_desc['xgbboot_version']))
+        return obj_layout
+    @property
+    def importances(self):
+        """Feature importances of the wrapped estimator as a plain list."""
+        return self.impl.feature_importances_.tolist()
--- a/alphamind/tests/model/test_linearmodel.py
+++ b/alphamind/tests/model/test_linearmodel.py
@@ -8,8 +8,11 @@ Created on 2017-9-4
 import unittest
 import numpy as np
 from sklearn.linear_model import LinearRegression as LinearRegression2
+from alphamind.model.loader import load_model
 from alphamind.model.linearmodel import ConstLinearModel
 from alphamind.model.linearmodel import LinearRegression
+from sklearn.linear_model import LogisticRegression as LogisticRegression2
+from alphamind.model.linearmodel import LogisticRegression
 class TestLinearModel(unittest.TestCase):
@@ -17,7 +20,8 @@ class TestLinearModel(unittest.TestCase):
    def setUp(self):
        self.n = 3
        self.train_x = np.random.randn(1000, self.n)
-        self.train_y = np.random.randn(1000, 1)
+        self.train_y = np.random.randn(1000)
+        self.train_y_label = np.where(self.train_y > 0., 1, 0)
        self.predict_x = np.random.randn(10, self.n)
    def test_const_linear_model(self):
@@ -36,7 +40,7 @@ class TestLinearModel(unittest.TestCase):
                                 weights=weights)
        desc = model.save()
-        new_model = ConstLinearModel.load(desc)
+        new_model = load_model(desc)
        self.assertEqual(model.features, new_model.features)
        np.testing.assert_array_almost_equal(model.weights, new_model.weights)
@@ -52,15 +56,44 @@ class TestLinearModel(unittest.TestCase):
        expected_y = expected_model.predict(self.predict_x)
        np.testing.assert_array_almost_equal(calculated_y, expected_y)
+        np.testing.assert_array_almost_equal(expected_model.coef_, model.weights)
    def test_linear_regression_persistence(self):
        model = LinearRegression(['a', 'b', 'c'], fit_intercept=False)
        model.fit(self.train_x, self.train_y)
        desc = model.save()
-        new_model = LinearRegression.load(desc)
+        new_model = load_model(desc)
        calculated_y = new_model.predict(self.predict_x)
        expected_y = model.predict(self.predict_x)
        np.testing.assert_array_almost_equal(calculated_y, expected_y)
+        np.testing.assert_array_almost_equal(new_model.weights, model.weights)
+    def test_logistic_regression(self):
+        """The wrapper must agree with a directly fitted sklearn ``LogisticRegression``."""
+        labels = self.train_y_label
+        wrapped = LogisticRegression(['a', 'b', 'c'], fit_intercept=False)
+        wrapped.fit(self.train_x, labels)
+        calculated_y = wrapped.predict(self.predict_x)
+        # Reference estimator fitted on exactly the same data and settings.
+        reference = LogisticRegression2(fit_intercept=False)
+        reference.fit(self.train_x, labels)
+        np.testing.assert_array_equal(calculated_y, reference.predict(self.predict_x))
+        np.testing.assert_array_almost_equal(reference.coef_, wrapped.weights)
+    def test_logistic_regression_persistence(self):
+        """A saved and reloaded ``LogisticRegression`` must predict and weigh identically."""
+        # Use the logistic model here: building a LinearRegression would leave
+        # the logistic save/load path untested.
+        model = LogisticRegression(['a', 'b', 'c'], fit_intercept=False)
+        model.fit(self.train_x, self.train_y_label)
+        desc = model.save()
+        new_model = load_model(desc)
+        calculated_y = new_model.predict(self.predict_x)
+        expected_y = model.predict(self.predict_x)
+        np.testing.assert_array_almost_equal(calculated_y, expected_y)
+        np.testing.assert_array_almost_equal(new_model.weights, model.weights)
--- a/alphamind/tests/model/test_treemodel.py
+++ b/alphamind/tests/model/test_treemodel.py
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-1-5
+@author: cheng.li
+"""
+import unittest
+import numpy as np
+from alphamind.model.loader import load_model
+from alphamind.model.treemodel import RandomForestRegressor
+from alphamind.model.treemodel import RandomForestClassifier
+from alphamind.model.treemodel import XGBRegressor
+from alphamind.model.treemodel import XGBClassifier
+class TestTreeModel(unittest.TestCase):
+    """Save/load round-trip tests for the tree-based model wrappers."""
+    def _check_persistence(self, model_cls, classify=False):
+        """Fit ``model_cls`` on random data and verify a save/load round trip.
+
+        :param model_cls: model class, constructed as ``model_cls(features=...)``.
+        :param classify: when true, the random targets are turned into 0/1 labels.
+        """
+        model = model_cls(features=list(range(10)))
+        x = np.random.randn(1000, 10)
+        y = np.random.randn(1000)
+        if classify:
+            # Classifiers need discrete targets: label each draw by its sign.
+            y = np.where(y > 0, 1, 0)
+        model.fit(x, y)
+        desc = model.save()
+        new_model = load_model(desc)
+        self.assertEqual(model.features, new_model.features)
+        # Predictions on fresh inputs must match between the fitted model and
+        # the one rebuilt from its description.
+        sample_x = np.random.randn(100, 10)
+        np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
+    def test_random_forest_regress_persistence(self):
+        self._check_persistence(RandomForestRegressor)
+    def test_random_forest_classify_persistence(self):
+        self._check_persistence(RandomForestClassifier, classify=True)
+    def test_xgb_regress_persistence(self):
+        self._check_persistence(XGBRegressor)
+    def test_xgb_classify_persistence(self):
+        self._check_persistence(XGBClassifier, classify=True)
--- a/alphamind/tests/test_suite.py
+++ b/alphamind/tests/test_suite.py
@@ -28,6 +28,7 @@ from alphamind.tests.analysis.test_perfanalysis import TestPerformanceAnalysis
 from alphamind.tests.analysis.test_factoranalysis import TestFactorAnalysis
 from alphamind.tests.analysis.test_quantilieanalysis import TestQuantileAnalysis
 from alphamind.tests.model.test_linearmodel import TestLinearModel
+from alphamind.tests.model.test_treemodel import TestTreeModel
 from alphamind.tests.model.test_loader import TestLoader
 from alphamind.tests.execution.test_naiveexecutor import TestNaiveExecutor
 from alphamind.tests.execution.test_thresholdexecutor import TestThresholdExecutor
@@ -54,6 +55,7 @@ if __name__ == '__main__':
                         TestFactorAnalysis,
                         TestQuantileAnalysis,
                         TestLinearModel,
+                         TestTreeModel,
                         TestLoader,
                         TestNaiveExecutor,
                         TestThresholdExecutor,

--- a/alphamind/utilities.py
+++ b/alphamind/utilities.py
@@ -16,6 +16,27 @@ import numba as nb
 alpha_logger = CustomLogger('ALPHA_MIND', 'info')
+# Horizons for the frequency codes that map to a fixed value.
+_FIXED_FREQ_HORIZONS = {
+    '1m': 21,
+    '1w': 4,
+    '2w': 9,
+    '3w': 14,
+    '4w': 19,
+    '1d': 0,
+}
+def map_freq(freq):
+    """Map a frequency code to its integer horizon.
+
+    Fixed codes: ``'1m'`` -> 21, ``'1w'`` -> 4, ``'2w'`` -> 9, ``'3w'`` -> 14,
+    ``'4w'`` -> 19, ``'1d'`` -> 0.  A code of the form ``'<N>b'`` maps to
+    ``N - 1``.
+
+    :param freq: frequency code string.
+    :return: the horizon as an ``int``.
+    :raises ValueError: if ``freq`` is not one of the recognised codes
+        (including empty, non-string, or non-numeric ``'<N>b'`` values).
+    """
+    # Guard first so empty or non-string input fails with the same clear
+    # error instead of an IndexError/TypeError from indexing.
+    if isinstance(freq, str) and freq:
+        if freq in _FIXED_FREQ_HORIZONS:
+            return _FIXED_FREQ_HORIZONS[freq]
+        if freq[-1] == "b":
+            try:
+                return int(freq[:-1]) - 1
+            except ValueError:
+                # Non-numeric prefix such as 'b' or 'xb': fall through to the
+                # uniform "unrecognized" error below.
+                pass
+    raise ValueError("Unrecognized freq: {0}".format(freq))
 def groupby(groups):
    order = groups.argsort()
    t = groups[order]

--- a/build_linux_dependencies.sh
+++ b/build_linux_dependencies.sh
 #!/bin/sh
-cd alphamind/pfopt
+cd xgboost
+git submodule init
+git submodule update
-./build_linux.sh
+make -j4
+cd python-package
+python setup.py install
+if [ $? -ne 0 ] ; then
+    cd ../..
+    exit 1
+fi
+cd ../..
+cd alphamind/pfopt
+./build_linux.sh
 if [ $? -ne 0 ] ; then
    cd ../..
    exit 1

--- a/build_windows_dependencies.bat
+++ b/build_windows_dependencies.bat
 @echo off
+cd xgboost
+git submodule init
+git submodule update
+mkdir build
+cd build
+cmake .. -G "Visual Studio 14 2015 Win64"
+msbuild xgboost.sln /m /p:Configuration=Release /p:Platform=x64
+if %errorlevel% neq 0 exit /b 1
+cd ../python-package
+python setup.py install
+if %errorlevel% neq 0 exit /b 1
+cd ../..
 cd alphamind\pfopt
 call build_windows.bat

--- a/notebooks/candidate_prod_model_20171204.ipynb
+++ b/notebooks/candidate_prod_model_20171204.ipynb
--- a/notebooks/model_comparing.ipynb
+++ b/notebooks/model_comparing.ipynb
--- a/notebooks/prod_model_20171117.ipynb
+++ b/notebooks/prod_model_20171117.ipynb
--- a/xgboost @ bf436718
+++ b/xgboost @ bf436718
+Subproject commit bf4367184164e593cd2856ef38f8dd4f8cc76999