Commit c0e2023c authored by Dr.李

added native xgboost model

parent 8b47319b
@@ -280,7 +280,7 @@ def fetch_train_phase(engine,
     return_df, factor_df = df[['trade_date', 'code', 'dx']], df[
         ['trade_date', 'code', 'isOpen'] + transformer.names]
-    return_df, dates, date_label, risk_exp, x_values, y_values, _, _ = \
+    return_df, dates, date_label, risk_exp, x_values, y_values, _, _, codes = \
         _merge_df(engine, transformer.names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)

     if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
@@ -293,6 +293,7 @@ def fetch_train_phase(engine,
     index = (date_label >= start) & (date_label <= end)
     this_raw_x = x_values[index]
     this_raw_y = y_values[index]
+    this_code = codes[index]
     if risk_exp is not None:
         this_risk_exp = risk_exp[index]
     else:
@@ -310,7 +311,7 @@ def fetch_train_phase(engine,
     ret = dict()
     ret['x_names'] = transformer.names
-    ret['train'] = {'x': ne_x, 'y': ne_y}
+    ret['train'] = {'x': ne_x, 'y': ne_y, 'code': this_code}

     return ret
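For orientation, `fetch_train_phase` now threads the security codes through to its payload. A minimal sketch of a consumer, assuming a call signature parallel to the `fetch_predict_phase` call in `__main__` below (the factor list and dates here are illustrative, not from this commit):

```python
# Hypothetical consumer of the extended training payload; assumes
# ret['train']['code'] is row-aligned with the feature matrix and targets.
ret = fetch_train_phase(engine,
                        ['ep_q'],
                        '2018-01-08',
                        '5b',
                        universe,
                        16,
                        neutralized_risk=['SIZE'])

x, y = ret['train']['x'], ret['train']['y']
codes = ret['train']['code']      # security codes, one per sample row
assert len(codes) == x.shape[0]   # alignment comes from the shared codes[index] mask
```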
@@ -361,7 +362,6 @@ def fetch_predict_phase(engine,
     end = dates[-1]
     start = dates[-batch]
-    # index = (date_label >= start) & (date_label <= end)
     left_index = bisect.bisect_left(date_label, start)
     right_index = bisect.bisect_right(date_label, end)
     this_raw_x = x_values[left_index:right_index]
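The bisect window above replaces the commented-out boolean mask; it selects the same rows in O(log n) rather than scanning the whole array, but only because `date_label` is sorted ascending. A self-contained sketch of the equivalence, with toy data:

```python
import bisect
import datetime as dt

# Illustrative data (assumed): bisect requires date_label sorted ascending.
date_label = [dt.date(2018, 1, 2), dt.date(2018, 1, 2), dt.date(2018, 1, 3),
              dt.date(2018, 1, 4), dt.date(2018, 1, 4), dt.date(2018, 1, 5),
              dt.date(2018, 1, 8)]
start, end = dt.date(2018, 1, 3), dt.date(2018, 1, 5)

left_index = bisect.bisect_left(date_label, start)    # first position >= start
right_index = bisect.bisect_right(date_label, end)    # first position > end

# Half-open slice covers exactly the labels in [start, end].
assert date_label[left_index:right_index] == [dt.date(2018, 1, 3),
                                              dt.date(2018, 1, 4),
                                              dt.date(2018, 1, 4),
                                              dt.date(2018, 1, 5)]
```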
@@ -399,27 +399,12 @@ def fetch_predict_phase(engine,
 if __name__ == '__main__':
     engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
-    universe = Universe('zz500', ['ashare_ex'])
+    universe = Universe('zz500', ['hs300', 'zz500'])
     neutralized_risk = ['SIZE']
-    res = fetch_data_package(engine,
-                             ['EPS', 'CFinc1'],
-                             '2017-09-01',
-                             '2017-09-04',
-                             '1w',
-                             universe,
-                             benchmark=905,
-                             warm_start=1,
-                             neutralized_risk=neutralized_risk)
-    print(res)
-    res = fetch_predict_phase(engine,
-                              ['EPS', 'CFinc1'],
-                              '2017-09-04',
-                              '2w',
-                              universe,
-                              4,
-                              warm_start=1,
-                              neutralized_risk=neutralized_risk)
+    res = fetch_predict_phase(engine, ['ep_q'],
+                              '2018-01-08',
+                              '5b',
+                              universe,
+                              16,
+                              neutralized_risk=neutralized_risk)
     print(res)
@@ -21,7 +21,7 @@ class ModelBase(metaclass=abc.ABCMeta):
         self.impl = None
         self.trained_time = None

-    def fit(self, x, y):
+    def fit(self, x: np.ndarray, y: np.ndarray):
         self.impl.fit(x, y.flatten())
         self.trained_time = arrow.now().format("YYYY-MM-DD HH:mm:ss")
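The `fit` contract shown here (delegate to `self.impl`, flatten the target) means any estimator with an sklearn-style `fit`/`predict` can back a `ModelBase` subclass. A hedged sketch of a hypothetical subclass, assuming `ModelBase` needs nothing beyond what this hunk shows:

```python
from typing import List
import numpy as np
from sklearn.linear_model import LinearRegression

# Hypothetical subclass: ModelBase.fit already delegates to
# self.impl.fit(x, y.flatten()), so wiring in an estimator is enough.
class LinearModel(ModelBase):

    def __init__(self, features: List = None, **kwargs):
        super().__init__(features)
        self.impl = LinearRegression(**kwargs)

    def predict(self, x: np.ndarray) -> np.ndarray:
        return self.impl.predict(x)
```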
@@ -7,9 +7,13 @@ Created on 2017-12-4
 from typing import List
 from distutils.version import LooseVersion
 import arrow
+import numpy as np
 from sklearn import __version__ as sklearn_version
 from sklearn.ensemble import RandomForestRegressor as RandomForestRegressorImpl
 from sklearn.ensemble import RandomForestClassifier as RandomForestClassifierImpl
+from sklearn.model_selection import train_test_split
+import xgboost as xgb
 from xgboost import __version__ as xgbboot_version
 from xgboost import XGBRegressor as XGBRegressorImpl
 from xgboost import XGBClassifier as XGBClassifierImpl
@@ -155,4 +159,86 @@ class XGBClassifier(ModelBase):
         return self.impl.feature_importances_.tolist()


+class XGBTrainer(ModelBase):
+
+    def __init__(self,
+                 objective,
+                 booster='gbtree',
+                 tree_method='hist',
+                 n_estimators: int = 100,
+                 learning_rate: float = 0.1,
+                 max_depth=3,
+                 eval_sample=None,
+                 early_stopping_rounds=None,
+                 subsample=1.,
+                 colsample_bytree=1.,
+                 features: List = None,
+                 **kwargs):
+        super().__init__(features)
+        self.params = {
+            'silent': 1,
+            'objective': objective,
+            'max_depth': max_depth,
+            'eta': learning_rate,
+            'booster': booster,
+            'tree_method': tree_method,
+            'subsample': subsample,
+            'colsample_bytree': colsample_bytree
+        }
+        self.eval_sample = eval_sample
+        self.num_boost_round = n_estimators
+        self.early_stopping_rounds = early_stopping_rounds
+        self.impl = None
+
+    def fit(self, x, y):
+        if self.eval_sample:
+            x_train, x_eval, y_train, y_eval = train_test_split(x,
+                                                                y,
+                                                                test_size=self.eval_sample,
+                                                                random_state=42)
+            d_train = xgb.DMatrix(x_train, y_train)
+            d_eval = xgb.DMatrix(x_eval, y_eval)
+            self.impl = xgb.train(params=self.params,
+                                  dtrain=d_train,
+                                  num_boost_round=self.num_boost_round,
+                                  evals=[(d_eval, 'eval')],
+                                  early_stopping_rounds=self.early_stopping_rounds,
+                                  verbose_eval=False)
+        else:
+            d_train = xgb.DMatrix(x, y)
+            self.impl = xgb.train(params=self.params,
+                                  dtrain=d_train,
+                                  num_boost_round=self.num_boost_round)
+        self.trained_time = arrow.now().format("YYYY-MM-DD HH:mm:ss")
+
+    def predict(self, x: np.ndarray) -> np.ndarray:
+        d_predict = xgb.DMatrix(x)
+        return self.impl.predict(d_predict)
+
+    def save(self) -> dict:
+        model_desc = super().save()
+        model_desc['xgbboot_version'] = xgbboot_version
+        model_desc['importances'] = self.importances
+        return model_desc
+
+    @classmethod
+    def load(cls, model_desc: dict):
+        obj_layout = super().load(model_desc)
+
+        # compare against the runtime xgboost version, not sklearn's
+        if LooseVersion(xgbboot_version) < LooseVersion(model_desc['xgbboot_version']):
+            alpha_logger.warning('Current xgboost version {0} is lower than the model version {1}. '
+                                 'Loaded model may work incorrectly.'.format(xgbboot_version,
+                                                                             model_desc['xgbboot_version']))
+        return obj_layout
+
+    @property
+    def importances(self):
+        imps = self.impl.get_fscore().items()
+        # sort feature keys numerically ('f2' before 'f10'), not lexically
+        imps = sorted(imps, key=lambda x: int(x[0][1:]))
+        return list(zip(*imps))[1]
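To make the new trainer concrete, a hypothetical smoke test on synthetic data. The hyper-parameters are illustrative, and `'reg:linear'` is the regression objective name used by xgboost releases of this period:

```python
import numpy as np

# Synthetic regression data; purely illustrative.
x = np.random.randn(500, 10)
y = x @ np.random.randn(10) + 0.1 * np.random.randn(500)

model = XGBTrainer(objective='reg:linear',
                   n_estimators=50,
                   eval_sample=0.2,            # hold out 20% for early stopping
                   early_stopping_rounds=10)
model.fit(x, y)

pred = model.predict(x)    # np.ndarray of shape (500,)
desc = model.save()        # dict carrying xgbboot_version and importances
print(model.importances)   # split counts from get_fscore (unused features omitted)
```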