Unverified commit 7924db6d authored by lion-sing, committed by GitHub

Merge pull request #2 from alpha-miner/master

sync before commit
parents 9203f363 a4d062a0
@@ -33,6 +33,7 @@ from alphamind.model import RandomForestRegressor
 from alphamind.model import RandomForestClassifier
 from alphamind.model import XGBRegressor
 from alphamind.model import XGBClassifier
+from alphamind.model import XGBTrainer
 from alphamind.model import load_model
 from alphamind.model.data_preparing import fetch_data_package
 from alphamind.model.data_preparing import fetch_train_phase
@@ -74,6 +75,7 @@ __all__ = [
     'RandomForestClassifier',
     'XGBRegressor',
     'XGBClassifier',
+    'XGBTrainer',
     'load_model',
     'NaiveExecutor',
     'ThresholdExecutor',
...
@@ -1954,5 +1954,5 @@ class OutrightTmp(Base):
 if __name__ == '__main__':
     from sqlalchemy import create_engine
-    engine = create_engine('postgres+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
+    engine = create_engine('postgres+psycopg2://postgres:we083826@101.132.104.118/alpha')
     Base.metadata.create_all(engine)
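A side note on the connection string above: `postgres+psycopg2` is a legacy SQLAlchemy spelling; the bare `postgres` alias is deprecated and was removed in SQLAlchemy 1.4, so newer installs need `postgresql+psycopg2` (the form the other files in this diff already use). A minimal sketch with placeholder credentials:

    from sqlalchemy import create_engine

    # 'postgresql+psycopg2' is the canonical dialect+driver name;
    # host, user and password here are placeholders, not the values above
    engine = create_engine('postgresql+psycopg2://user:password@localhost/alpha')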
@@ -388,11 +388,13 @@ class SqlEngine(object):
         df = pd.read_sql(query, self.engine)

         if universe.is_filtered:
             codes = universe.query(self, start_date, end_date, dates)
-            df = pd.merge(df, codes, how='inner', on=['trade_date', 'code']).sort_values(['trade_date', 'code'])
+            df = pd.merge(df, codes, how='inner', on=['trade_date', 'code'])

         if external_data is not None:
             df = pd.merge(df, external_data, on=['trade_date', 'code']).dropna()

+        df.sort_values(['trade_date', 'code'], inplace=True)
         df.set_index('trade_date', inplace=True)
         res = transformer.transform('code', df)
...
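The hunk above moves `sort_values` out of the `is_filtered` branch, so the frame is re-sorted on every code path, including after the optional `external_data` merge. A self-contained toy illustration of the pattern (column names as in the method above):

    import pandas as pd

    factors = pd.DataFrame({'trade_date': ['2018-01-05', '2018-01-04'],
                            'code': [2, 1],
                            'PE': [10., 20.]})
    external = pd.DataFrame({'trade_date': ['2018-01-04', '2018-01-05'],
                             'code': [1, 2],
                             'score': [0.3, 0.7]})

    df = pd.merge(factors, external, on=['trade_date', 'code']).dropna()
    # sort once, after all merges, so downstream consumers always
    # see rows ordered by date and code
    df.sort_values(['trade_date', 'code'], inplace=True)
    df.set_index('trade_date', inplace=True)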
@@ -17,25 +17,24 @@ import datetime as dt

 start = dt.datetime.now()

-universe_name = 'zz500'
-factor_name = 'PE'
-expression = 1. / LAST(factor_name)
-alpha_factor_name = '1/PE'
-alpha_factor = {alpha_factor_name: expression}
+universe = Universe('custom', ['zz800'])
+simple_expression = CSRes(LAST('OperCashInToAsset'), 'roe_q')
+alpha_factor_name = 'alpha_factor'
+alpha_factor = {alpha_factor_name: simple_expression}

 # end of formula definition
 engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
-universe = Universe('custom', [universe_name])
-neutralize_risk = ['SIZE'] + industry_styles
-freq = '5b'
+neutralize_risk = ['SIZE', 'LEVERAGE'] + industry_styles
+freq = '10b'
 n_bins = 5
 horizon = map_freq(freq)

 start_date = '2012-01-01'
-end_date = '2017-11-21'
+end_date = '2018-01-05'

 dates = makeSchedule(start_date,
                      end_date,
@@ -93,10 +92,9 @@ df = df.cumsum().plot(ax=axes[0], title='Quantile Analysis for {0}'.format(alpha
 # =================================================================== #

 factor_name = 'PE'
-expression = DIFF(1./LAST(factor_name))
-alpha_factor_name = '1/PE_1w_diff'
-alpha_factor = {alpha_factor_name: expression}
+alpha_factor_name = alpha_factor_name + '_1w_diff'
+alpha_factor = {alpha_factor_name: DIFF(simple_expression)}

 dates = makeSchedule(start_date,
                      end_date,
...
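The rewritten example keeps a single `simple_expression` object and derives the differenced factor from it with `DIFF`, so the two studies stay in sync if the formula changes. A condensed sketch of the pattern (assuming the `PyFin.api` operators that alpha-mind examples import; `CSRes` presumably keeps the cross-sectional residual of the first expression regressed on the second):

    from PyFin.api import LAST, DIFF, CSRes

    # one expression object, reused by both studies
    simple_expression = CSRes(LAST('OperCashInToAsset'), 'roe_q')

    base_factor = {'alpha_factor': simple_expression}
    diff_factor = {'alpha_factor_1w_diff': DIFF(simple_expression)}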
@@ -14,6 +14,7 @@ from alphamind.model.treemodel import RandomForestRegressor
 from alphamind.model.treemodel import RandomForestClassifier
 from alphamind.model.treemodel import XGBRegressor
 from alphamind.model.treemodel import XGBClassifier
+from alphamind.model.treemodel import XGBTrainer
 from alphamind.model.loader import load_model
@@ -26,4 +27,5 @@ __all__ = ['LinearRegression',
            'RandomForestClassifier',
            'XGBRegressor',
            'XGBClassifier',
+           'XGBTrainer',
            'load_model']
\ No newline at end of file
@@ -27,6 +27,7 @@ from alphamind.utilities import map_freq

 def _merge_df(engine, names, factor_df, return_df, universe, dates, risk_model, neutralized_risk):
     risk_df = engine.fetch_risk_model_range(universe, dates=dates, risk_model=risk_model)[1]
+    alpha_logger.info("risk data loading finished")
     used_neutralized_risk = list(set(total_risk_factors).difference(names))
     risk_df = risk_df[['trade_date', 'code'] + used_neutralized_risk].dropna()
     return_df = pd.merge(return_df, risk_df, on=['trade_date', 'code'])
@@ -45,9 +46,10 @@ def _merge_df(engine, names, factor_df, return_df, universe, dates, risk_model,
     x_values = train_x[names].values.astype(float)
     y_values = train_y[['dx']].values
+    codes = train_x['code'].values

     date_label = pd.DatetimeIndex(factor_df.trade_date).to_pydatetime()
     dates = np.unique(date_label)

-    return return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y
+    return return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y, codes


 def prepare_data(engine: SqlEngine,
@@ -82,9 +84,13 @@ def prepare_data(engine: SqlEngine,
     factor_df = engine.fetch_factor_range(universe,
                                           factors=transformer,
                                           dates=dates).sort_values(['trade_date', 'code'])
+    alpha_logger.info("factor data loading finished")
     return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
+    alpha_logger.info("return data loading finished")
     industry_df = engine.fetch_industry_range(universe, dates=dates)
+    alpha_logger.info("industry data loading finished")
     benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
+    alpha_logger.info("benchmark data loading finished")

     df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()
     df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
@@ -102,13 +108,15 @@ def batch_processing(x_values,
                      batch,
                      risk_exp,
                      pre_process,
-                     post_process):
+                     post_process,
+                     codes):
     train_x_buckets = {}
     train_y_buckets = {}
     train_risk_buckets = {}
     predict_x_buckets = {}
     predict_y_buckets = {}
     predict_risk_buckets = {}
+    predict_codes_bucket = {}

     for i, start in enumerate(groups[:-batch]):
         end = groups[i + batch]
@@ -141,6 +149,7 @@ def batch_processing(x_values,
             sub_dates = group_label[left_index:right_index]
             this_raw_x = x_values[left_index:right_index]
+            this_codes = codes[left_index:right_index]

             if risk_exp is not None:
                 this_risk_exp = risk_exp[left_index:right_index]
@@ -156,6 +165,7 @@ def batch_processing(x_values,
             inner_right_index = bisect.bisect_right(sub_dates, end)
             predict_x_buckets[end] = ne_x[inner_left_index:inner_right_index]
             predict_risk_buckets[end] = this_risk_exp[inner_left_index:inner_right_index]
+            predict_codes_bucket[end] = this_codes[inner_left_index:inner_right_index]

             this_raw_y = y_values[left_index:right_index]
             if len(this_raw_y) > 0:
@@ -165,7 +175,13 @@ def batch_processing(x_values,
                                post_process=post_process)
                 predict_y_buckets[end] = ne_y[inner_left_index:inner_right_index]

-    return train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets
+    return train_x_buckets, \
+        train_y_buckets, \
+        train_risk_buckets, \
+        predict_x_buckets, \
+        predict_y_buckets, \
+        predict_risk_buckets, \
+        predict_codes_bucket


 def fetch_data_package(engine: SqlEngine,
@@ -193,9 +209,11 @@ def fetch_data_package(engine: SqlEngine,
                               benchmark,
                               warm_start)

-    return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y = \
+    return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y, codes = \
         _merge_df(engine, transformer.names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)
+    alpha_logger.info("data merging finished")

     return_df['weight'] = train_x['weight']
     return_df['industry'] = train_x['industry']
     return_df['industry_code'] = train_x['industry_code']
@@ -207,15 +225,16 @@ def fetch_data_package(engine: SqlEngine,
     alpha_logger.info("Loading data is finished")

-    train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets = batch_processing(
-        x_values,
-        y_values,
-        dates,
-        date_label,
-        batch,
-        risk_exp,
-        pre_process,
-        post_process)
+    train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets, predict_codes_bucket \
+        = batch_processing(x_values,
+                           y_values,
+                           dates,
+                           date_label,
+                           batch,
+                           risk_exp,
+                           pre_process,
+                           post_process,
+                           codes)

     alpha_logger.info("Data processing is finished")
@@ -223,7 +242,7 @@ def fetch_data_package(engine: SqlEngine,
     ret['x_names'] = transformer.names
     ret['settlement'] = return_df
     ret['train'] = {'x': train_x_buckets, 'y': train_y_buckets, 'risk': train_risk_buckets}
-    ret['predict'] = {'x': predict_x_buckets, 'y': predict_y_buckets, 'risk': predict_risk_buckets}
+    ret['predict'] = {'x': predict_x_buckets, 'y': predict_y_buckets, 'risk': predict_risk_buckets, 'code': predict_codes_bucket}
     return ret
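With the extra bucket, every entry of `ret['predict']` is keyed by the prediction date, and the rows of `x` align one-to-one with the instrument codes for that date. A minimal consumption sketch (assuming `data_package` came from `fetch_data_package`):

    # data_package obtained from fetch_data_package(...)
    predict = data_package['predict']
    for end_date, x in predict['x'].items():
        codes = predict['code'][end_date]  # instrument codes for the same rows
        assert len(codes) == len(x)        # one code per prediction row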
@@ -261,7 +280,7 @@ def fetch_train_phase(engine,
     return_df, factor_df = df[['trade_date', 'code', 'dx']], df[
         ['trade_date', 'code', 'isOpen'] + transformer.names]

-    return_df, dates, date_label, risk_exp, x_values, y_values, _, _ = \
+    return_df, dates, date_label, risk_exp, x_values, y_values, _, _, codes = \
         _merge_df(engine, transformer.names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)

     if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
@@ -274,6 +293,7 @@ def fetch_train_phase(engine,
     index = (date_label >= start) & (date_label <= end)
     this_raw_x = x_values[index]
     this_raw_y = y_values[index]
+    this_code = codes[index]
     if risk_exp is not None:
         this_risk_exp = risk_exp[index]
     else:
@@ -291,7 +311,7 @@ def fetch_train_phase(engine,
     ret = dict()
     ret['x_names'] = transformer.names
-    ret['train'] = {'x': ne_x, 'y': ne_y}
+    ret['train'] = {'x': ne_x, 'y': ne_y, 'code': this_code}

     return ret
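`fetch_train_phase` now carries the instrument codes of the training rows alongside the neutralized x/y arrays. A usage sketch based on the old `__main__` block of this module (factor list and dates are illustrative only):

    ret = fetch_train_phase(engine,
                            ['EPS'],
                            '2017-09-04',
                            '2w',
                            universe,
                            4,
                            neutralized_risk=['SIZE'])
    x, y = ret['train']['x'], ret['train']['y']
    codes = ret['train']['code']  # same row order as x and y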
@@ -342,7 +362,6 @@ def fetch_predict_phase(engine,
         end = dates[-1]
         start = dates[-batch]

-        # index = (date_label >= start) & (date_label <= end)
         left_index = bisect.bisect_left(date_label, start)
         right_index = bisect.bisect_right(date_label, end)
         this_raw_x = x_values[left_index:right_index]
@@ -380,26 +399,12 @@ def fetch_predict_phase(engine,
 if __name__ == '__main__':
     engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
-    universe = Universe('zz500', ['ashare_ex'])
+    universe = Universe('zz500', ['hs300', 'zz500'])
     neutralized_risk = ['SIZE']
-    res = fetch_train_phase(engine,
-                            ['EPS', 'CFinc1'],
-                            '2017-09-04',
-                            '2w',
-                            universe,
-                            4,
-                            warm_start=1,
-                            neutralized_risk=neutralized_risk)
-
-    print(res)
-
-    res = fetch_predict_phase(engine,
-                              ['EPS', 'CFinc1'],
-                              '2017-09-04',
-                              '2w',
-                              universe,
-                              4,
-                              warm_start=1,
-                              neutralized_risk=neutralized_risk)
+    res = fetch_predict_phase(engine, ['ep_q'],
+                              '2018-01-08',
+                              '5b',
+                              universe,
+                              16,
+                              neutralized_risk=neutralized_risk)

     print(res)
@@ -14,6 +14,7 @@ from alphamind.model.treemodel import RandomForestRegressor
 from alphamind.model.treemodel import RandomForestClassifier
 from alphamind.model.treemodel import XGBRegressor
 from alphamind.model.treemodel import XGBClassifier
+from alphamind.model.treemodel import XGBTrainer


 def load_model(model_desc: dict) -> ModelBase:
@@ -37,5 +38,7 @@ def load_model(model_desc: dict) -> ModelBase:
         return XGBRegressor.load(model_desc)
     elif 'XGBClassifier' in model_name_parts:
         return XGBClassifier.load(model_desc)
+    elif 'XGBTrainer' in model_name_parts:
+        return XGBTrainer.load(model_desc)
     else:
         raise ValueError('{0} is not currently supported in model loader.'.format(model_name))
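The loader dispatches on the class name recorded in the model description, so an `XGBTrainer` round-trips through a plain dict like the other models. A minimal sketch:

    import numpy as np
    from alphamind.model import XGBTrainer, load_model

    x = np.random.randn(1000, 10)
    y = np.where(np.random.randn(1000) > 0, 1, 0)

    model = XGBTrainer(features=list(range(10)))
    model.fit(x, y)

    desc = model.save()            # a serializable dict, including the xgboost version
    same_model = load_model(desc)  # dispatched to XGBTrainer.load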
@@ -21,7 +21,7 @@ class ModelBase(metaclass=abc.ABCMeta):
         self.impl = None
         self.trained_time = None

-    def fit(self, x, y):
+    def fit(self, x: np.ndarray, y: np.ndarray):
         self.impl.fit(x, y.flatten())
         self.trained_time = arrow.now().format("YYYY-MM-DD HH:mm:ss")
...
@@ -7,9 +7,13 @@ Created on 2017-12-4
 from typing import List
 from distutils.version import LooseVersion
+import arrow
+import numpy as np
 from sklearn import __version__ as sklearn_version
 from sklearn.ensemble import RandomForestRegressor as RandomForestRegressorImpl
 from sklearn.ensemble import RandomForestClassifier as RandomForestClassifierImpl
+from sklearn.model_selection import train_test_split
+import xgboost as xgb
 from xgboost import __version__ as xgbboot_version
 from xgboost import XGBRegressor as XGBRegressorImpl
 from xgboost import XGBClassifier as XGBClassifierImpl
@@ -155,4 +159,91 @@ class XGBClassifier(ModelBase):
         return self.impl.feature_importances_.tolist()
+
+
+class XGBTrainer(ModelBase):
+
+    def __init__(self,
+                 objective='binary:logistic',
+                 booster='gbtree',
+                 tree_method='hist',
+                 n_estimators: int = 100,
+                 learning_rate: float = 0.1,
+                 max_depth=3,
+                 eval_sample=None,
+                 early_stopping_rounds=None,
+                 subsample=1.,
+                 colsample_bytree=1.,
+                 features: List = None,
+                 random_state=0,
+                 **kwargs):
+        super().__init__(features)
+        self.params = {
+            'silent': 1,
+            'objective': objective,
+            'max_depth': max_depth,
+            'eta': learning_rate,
+            'booster': booster,
+            'tree_method': tree_method,
+            'subsample': subsample,
+            'colsample_bytree': colsample_bytree,
+            'seed': random_state
+        }
+        self.eval_sample = eval_sample
+        self.num_boost_round = n_estimators
+        self.early_stopping_rounds = early_stopping_rounds
+        self.impl = None
+        self.kwargs = kwargs
+
+    def fit(self, x, y):
+        if self.eval_sample:
+            x_train, x_eval, y_train, y_eval = train_test_split(x,
+                                                                y,
+                                                                test_size=self.eval_sample,
+                                                                random_state=42)
+            d_train = xgb.DMatrix(x_train, y_train)
+            d_eval = xgb.DMatrix(x_eval, y_eval)
+            self.impl = xgb.train(params=self.params,
+                                  dtrain=d_train,
+                                  num_boost_round=self.num_boost_round,
+                                  evals=[(d_eval, 'eval')],
+                                  early_stopping_rounds=self.early_stopping_rounds,
+                                  verbose_eval=False,
+                                  **self.kwargs)
+        else:
+            d_train = xgb.DMatrix(x, y)
+            self.impl = xgb.train(params=self.params,
+                                  dtrain=d_train,
+                                  num_boost_round=self.num_boost_round,
+                                  **self.kwargs)
+        self.trained_time = arrow.now().format("YYYY-MM-DD HH:mm:ss")
+
+    def predict(self, x: np.ndarray) -> np.ndarray:
+        d_predict = xgb.DMatrix(x)
+        return self.impl.predict(d_predict)
+
+    def save(self) -> dict:
+        model_desc = super().save()
+        model_desc['xgbboot_version'] = xgbboot_version
+        model_desc['importances'] = self.importances
+        return model_desc
+
+    @classmethod
+    def load(cls, model_desc: dict):
+        obj_layout = super().load(model_desc)
+
+        if LooseVersion(xgbboot_version) < LooseVersion(model_desc['xgbboot_version']):
+            alpha_logger.warning('Current xgboost version {0} is lower than the model version {1}. '
+                                 'Loaded model may work incorrectly.'.format(xgbboot_version,
+                                                                             model_desc['xgbboot_version']))
+        return obj_layout
+
+    @property
+    def importances(self):
+        imps = self.impl.get_fscore().items()
+        imps = sorted(imps, key=lambda x: x[0])
+        return list(zip(*imps))[1]
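As the tests below also exercise, `XGBTrainer` wraps the low-level `xgb.train` API rather than the sklearn-style wrappers, so with `objective='binary:logistic'` its `predict` returns probabilities that the caller thresholds. A usage sketch (the `eval_sample` fraction is held out via `train_test_split` for the eval set):

    import numpy as np
    from alphamind.model.treemodel import XGBTrainer

    x = np.random.randn(1000, 10)
    y = np.where(np.random.randn(1000) > 0, 1, 0)

    model = XGBTrainer(features=list(range(10)),
                       objective='binary:logistic',
                       n_estimators=200,
                       eval_sample=0.1)              # hold out 10% as the eval set
    model.fit(x, y)

    proba = model.predict(np.random.randn(100, 10))  # probabilities in [0, 1]
    labels = np.where(proba > 0.5, 1, 0)             # threshold as in the tests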
@@ -12,16 +12,18 @@ from alphamind.model.treemodel import RandomForestRegressor
 from alphamind.model.treemodel import RandomForestClassifier
 from alphamind.model.treemodel import XGBRegressor
 from alphamind.model.treemodel import XGBClassifier
+from alphamind.model.treemodel import XGBTrainer


 class TestTreeModel(unittest.TestCase):

+    def setUp(self):
+        self.x = np.random.randn(1000, 10)
+        self.y = np.random.randn(1000)
+
     def test_random_forest_regress_persistence(self):
         model = RandomForestRegressor(features=list(range(10)))
-        x = np.random.randn(1000, 10)
-        y = np.random.randn(1000)
-        model.fit(x, y)
+        model.fit(self.x, self.y)

         desc = model.save()
         new_model = load_model(desc)
@@ -29,14 +31,12 @@ class TestTreeModel(unittest.TestCase):
         sample_x = np.random.randn(100, 10)
         np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
+        np.testing.assert_array_almost_equal(model.importances, new_model.importances)

     def test_random_forest_classify_persistence(self):
         model = RandomForestClassifier(features=list(range(10)))
-        x = np.random.randn(1000, 10)
-        y = np.random.randn(1000)
-        y = np.where(y > 0, 1, 0)
-        model.fit(x, y)
+        y = np.where(self.y > 0, 1, 0)
+        model.fit(self.x, y)

         desc = model.save()
         new_model = load_model(desc)
@@ -44,13 +44,11 @@ class TestTreeModel(unittest.TestCase):
         sample_x = np.random.randn(100, 10)
         np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
+        np.testing.assert_array_almost_equal(model.importances, new_model.importances)

     def test_xgb_regress_persistence(self):
         model = XGBRegressor(features=list(range(10)))
-        x = np.random.randn(1000, 10)
-        y = np.random.randn(1000)
-        model.fit(x, y)
+        model.fit(self.x, self.y)

         desc = model.save()
         new_model = load_model(desc)
@@ -58,14 +56,56 @@ class TestTreeModel(unittest.TestCase):
         sample_x = np.random.randn(100, 10)
         np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
+        np.testing.assert_array_almost_equal(model.importances, new_model.importances)

     def test_xgb_classify_persistence(self):
         model = XGBClassifier(features=list(range(10)))
-        x = np.random.randn(1000, 10)
-        y = np.random.randn(1000)
-        y = np.where(y > 0, 1, 0)
-        model.fit(x, y)
+        y = np.where(self.y > 0, 1, 0)
+        model.fit(self.x, y)

         desc = model.save()
         new_model = load_model(desc)

         self.assertEqual(model.features, new_model.features)

         sample_x = np.random.randn(100, 10)
         np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
+        np.testing.assert_array_almost_equal(model.importances, new_model.importances)
+
+    def test_xgb_trainer_equal_classifier(self):
+        sample_x = np.random.randn(100, 10)
+
+        model1 = XGBClassifier(n_estimators=100,
+                               learning_rate=0.1,
+                               max_depth=3,
+                               features=list(range(10)),
+                               random_state=42)
+        model2 = XGBTrainer(features=list(range(10)),
+                            objective='reg:logistic',
+                            booster='gbtree',
+                            tree_method='exact',
+                            n_estimators=100,
+                            learning_rate=0.1,
+                            max_depth=3,
+                            random_state=42)
+
+        y = np.where(self.y > 0, 1, 0)
+        model1.fit(self.x, y)
+        model2.fit(self.x, y)
+
+        predict1 = model1.predict(sample_x)
+        predict2 = model2.predict(sample_x)
+        predict2 = np.where(predict2 > 0.5, 1., 0.)
+        np.testing.assert_array_almost_equal(predict1, predict2)
+
+    def test_xgb_trainer_persistence(self):
+        model = XGBTrainer(features=list(range(10)),
+                           objective='binary:logistic',
+                           booster='gbtree',
+                           tree_method='hist',
+                           n_estimators=200)
+
+        y = np.where(self.y > 0, 1, 0)
+        model.fit(self.x, y)

         desc = model.save()
         new_model = load_model(desc)
@@ -73,3 +113,4 @@ class TestTreeModel(unittest.TestCase):
         sample_x = np.random.randn(100, 10)
         np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
+        np.testing.assert_array_almost_equal(model.importances, new_model.importances)
@@ -4,7 +4,12 @@ cd xgboost
 git submodule init
 git submodule update

+mkdir build
+cd build
+cmake ..
 make -j4
+cd ..

 cd python-package
 python setup.py install
@@ -22,4 +27,4 @@ if [ $? -ne 0 ] ; then
   exit 1
 fi

 cd ../..
\ No newline at end of file
-Subproject commit bf4367184164e593cd2856ef38f8dd4f8cc76999
+Subproject commit a187ed6c8f3aa40b47d5be80667cbbe6a6fd563d