Commit d99cf0c5 authored by Yucheng

Merge branch 'master' of https://github.com/lion-sing/alpha-mind

parents bfc257f9 7924db6d
......@@ -33,6 +33,7 @@ from alphamind.model import RandomForestRegressor
from alphamind.model import RandomForestClassifier
from alphamind.model import XGBRegressor
from alphamind.model import XGBClassifier
from alphamind.model import XGBTrainer
from alphamind.model import load_model
from alphamind.model.data_preparing import fetch_data_package
from alphamind.model.data_preparing import fetch_train_phase
......@@ -74,6 +75,7 @@ __all__ = [
'RandomForestClassifier',
'XGBRegressor',
'XGBClassifier',
'XGBTrainer',
'load_model',
'NaiveExecutor',
'ThresholdExecutor',
......
......@@ -1954,5 +1954,5 @@ class OutrightTmp(Base):
if __name__ == '__main__':
from sqlalchemy import create_engine
engine = create_engine('postgres+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
engine = create_engine('postgres+psycopg2://postgres:we083826@101.132.104.118/alpha')
Base.metadata.create_all(engine)
......@@ -388,11 +388,13 @@ class SqlEngine(object):
df = pd.read_sql(query, self.engine)
if universe.is_filtered:
codes = universe.query(self, start_date, end_date, dates)
df = pd.merge(df, codes, how='inner', on=['trade_date', 'code']).sort_values(['trade_date', 'code'])
df = pd.merge(df, codes, how='inner', on=['trade_date', 'code'])
if external_data is not None:
df = pd.merge(df, external_data, on=['trade_date', 'code']).dropna()
df.sort_values(['trade_date', 'code'], inplace=True)
df.set_index('trade_date', inplace=True)
res = transformer.transform('code', df)
......
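The hunk above defers sort_values until after the optional external_data merge, so the frame is sorted exactly once, at the end, whichever branches ran. A minimal sketch of the merge-then-sort pattern, with fabricated toy frames:

import pandas as pd

# Toy stand-ins for the factor frame and the external overlay; values are fabricated.
df = pd.DataFrame({'trade_date': ['2018-01-02', '2018-01-01'],
                   'code': [2, 1],
                   'factor': [0.5, 0.3]})
external_data = pd.DataFrame({'trade_date': ['2018-01-01', '2018-01-02'],
                              'code': [1, 2],
                              'extra': [1.0, 2.0]})

# Merges can reshuffle rows, so sort once after all of them.
df = pd.merge(df, external_data, on=['trade_date', 'code']).dropna()
df.sort_values(['trade_date', 'code'], inplace=True)
df.set_index('trade_date', inplace=True)
print(df)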
......@@ -17,25 +17,24 @@ import datetime as dt
start = dt.datetime.now()
universe_name = 'zz500'
universe = Universe('custom', ['zz800'])
factor_name = 'PE'
expression = 1. / LAST(factor_name)
simple_expression = CSRes(LAST('OperCashInToAsset'), 'roe_q')
alpha_factor_name = '1/PE'
alpha_factor = {alpha_factor_name: expression}
alpha_factor_name = 'alpha_factor'
alpha_factor = {alpha_factor_name: simple_expression}
# end of formula definition
engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
universe = Universe('custom', [universe_name])
neutralize_risk = ['SIZE'] + industry_styles
freq = '5b'
neutralize_risk = ['SIZE', 'LEVERAGE'] + industry_styles
freq = '10b'
n_bins = 5
horizon = map_freq(freq)
start_date = '2012-01-01'
end_date = '2017-11-21'
end_date = '2018-01-05'
dates = makeSchedule(start_date,
end_date,
......@@ -93,10 +92,9 @@ df = df.cumsum().plot(ax=axes[0], title='Quantile Analysis for {0}'.format(alpha
# =================================================================== #
factor_name = 'PE'
expression = DIFF(1./LAST(factor_name))
alpha_factor_name = '1/PE_1w_diff'
alpha_factor = {alpha_factor_name: expression}
alpha_factor_name = alpha_factor_name + '_1w_diff'
alpha_factor = {alpha_factor_name: DIFF(simple_expression)}
dates = makeSchedule(start_date,
end_date,
......
......@@ -14,6 +14,7 @@ from alphamind.model.treemodel import RandomForestRegressor
from alphamind.model.treemodel import RandomForestClassifier
from alphamind.model.treemodel import XGBRegressor
from alphamind.model.treemodel import XGBClassifier
from alphamind.model.treemodel import XGBTrainer
from alphamind.model.loader import load_model
......@@ -26,4 +27,5 @@ __all__ = ['LinearRegression',
'RandomForestClassifier',
'XGBRegressor',
'XGBClassifier',
'XGBTrainer',
'load_model']
\ No newline at end of file
......@@ -27,6 +27,7 @@ from alphamind.utilities import map_freq
def _merge_df(engine, names, factor_df, return_df, universe, dates, risk_model, neutralized_risk):
risk_df = engine.fetch_risk_model_range(universe, dates=dates, risk_model=risk_model)[1]
alpha_logger.info("risk data loading finished")
used_neutralized_risk = list(set(total_risk_factors).difference(names))
risk_df = risk_df[['trade_date', 'code'] + used_neutralized_risk].dropna()
return_df = pd.merge(return_df, risk_df, on=['trade_date', 'code'])
......@@ -45,9 +46,10 @@ def _merge_df(engine, names, factor_df, return_df, universe, dates, risk_model,
x_values = train_x[names].values.astype(float)
y_values = train_y[['dx']].values
codes = train_x['code'].values
date_label = pd.DatetimeIndex(factor_df.trade_date).to_pydatetime()
dates = np.unique(date_label)
return return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y
return return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y, codes
def prepare_data(engine: SqlEngine,
......@@ -82,9 +84,13 @@ def prepare_data(engine: SqlEngine,
factor_df = engine.fetch_factor_range(universe,
factors=transformer,
dates=dates).sort_values(['trade_date', 'code'])
alpha_logger.info("factor data loading finished")
return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
alpha_logger.info("return data loading finished")
industry_df = engine.fetch_industry_range(universe, dates=dates)
alpha_logger.info("industry data loading finished")
benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
alpha_logger.info("benchmark data loading finished")
df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()
df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
......@@ -102,13 +108,15 @@ def batch_processing(x_values,
batch,
risk_exp,
pre_process,
post_process):
post_process,
codes):
train_x_buckets = {}
train_y_buckets = {}
train_risk_buckets = {}
predict_x_buckets = {}
predict_y_buckets = {}
predict_risk_buckets = {}
predict_codes_bucket = {}
for i, start in enumerate(groups[:-batch]):
end = groups[i + batch]
......@@ -141,6 +149,7 @@ def batch_processing(x_values,
sub_dates = group_label[left_index:right_index]
this_raw_x = x_values[left_index:right_index]
this_codes = codes[left_index:right_index]
if risk_exp is not None:
this_risk_exp = risk_exp[left_index:right_index]
......@@ -156,6 +165,7 @@ def batch_processing(x_values,
inner_right_index = bisect.bisect_right(sub_dates, end)
predict_x_buckets[end] = ne_x[inner_left_index:inner_right_index]
predict_risk_buckets[end] = this_risk_exp[inner_left_index:inner_right_index]
predict_codes_bucket[end] = this_codes[inner_left_index:inner_right_index]
this_raw_y = y_values[left_index:right_index]
if len(this_raw_y) > 0:
......@@ -165,7 +175,13 @@ def batch_processing(x_values,
post_process=post_process)
predict_y_buckets[end] = ne_y[inner_left_index:inner_right_index]
return train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets
return train_x_buckets, \
train_y_buckets, \
train_risk_buckets, \
predict_x_buckets, \
predict_y_buckets, \
predict_risk_buckets, \
predict_codes_bucket
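Each training window above is cut with bisect over the sorted per-row date labels produced by _merge_df. A self-contained sketch of that slicing, with fabricated dates and a batch of 2:

import bisect
import datetime as dt
import numpy as np

# Sorted per-row date labels, two rows per date (fabricated).
date_label = np.array([dt.datetime(2018, 1, d) for d in (1, 1, 2, 2, 3, 3)])
groups = np.unique(date_label)

batch = 2
for i, start in enumerate(groups[:-batch]):
    end = groups[i + batch]
    left_index = bisect.bisect_left(date_label, start)
    right_index = bisect.bisect_right(date_label, end)
    # All rows whose date falls in [start, end]; the last date in the
    # window is the prediction slice, the rest is training data.
    print(start, end, right_index - left_index)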
def fetch_data_package(engine: SqlEngine,
......@@ -193,9 +209,11 @@ def fetch_data_package(engine: SqlEngine,
benchmark,
warm_start)
return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y = \
return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y, codes = \
_merge_df(engine, transformer.names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)
alpha_logger.info("data merging finished")
return_df['weight'] = train_x['weight']
return_df['industry'] = train_x['industry']
return_df['industry_code'] = train_x['industry_code']
......@@ -207,15 +225,16 @@ def fetch_data_package(engine: SqlEngine,
alpha_logger.info("Loading data is finished")
train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets = batch_processing(
x_values,
y_values,
dates,
date_label,
batch,
risk_exp,
pre_process,
post_process)
train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets, predict_codes_bucket \
= batch_processing(x_values,
y_values,
dates,
date_label,
batch,
risk_exp,
pre_process,
post_process,
codes)
alpha_logger.info("Data processing is finished")
......@@ -223,7 +242,7 @@ def fetch_data_package(engine: SqlEngine,
ret['x_names'] = transformer.names
ret['settlement'] = return_df
ret['train'] = {'x': train_x_buckets, 'y': train_y_buckets, 'risk': train_risk_buckets}
ret['predict'] = {'x': predict_x_buckets, 'y': predict_y_buckets, 'risk': predict_risk_buckets}
ret['predict'] = {'x': predict_x_buckets, 'y': predict_y_buckets, 'risk': predict_risk_buckets, 'code': predict_codes_bucket}
return ret
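With the new code bucket, a consumer can attach security codes to each prediction slice. A hedged sketch over a fabricated package dict (a toy row sum stands in for a real model):

import numpy as np
import pandas as pd

# Fabricated stand-in for the dict built above: one prediction date,
# three securities, two factors.
package = {'predict': {'x': {'2018-01-08': np.random.randn(3, 2)},
                       'code': {'2018-01-08': np.array([1, 2, 3])}}}

for date, x in package['predict']['x'].items():
    codes = package['predict']['code'][date]
    preds = pd.Series(x.sum(axis=1), index=codes, name=date)
    print(preds)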
......@@ -261,7 +280,7 @@ def fetch_train_phase(engine,
return_df, factor_df = df[['trade_date', 'code', 'dx']], df[
['trade_date', 'code', 'isOpen'] + transformer.names]
return_df, dates, date_label, risk_exp, x_values, y_values, _, _ = \
return_df, dates, date_label, risk_exp, x_values, y_values, _, _, codes = \
_merge_df(engine, transformer.names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)
if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
......@@ -274,6 +293,7 @@ def fetch_train_phase(engine,
index = (date_label >= start) & (date_label <= end)
this_raw_x = x_values[index]
this_raw_y = y_values[index]
this_code = codes[index]
if risk_exp is not None:
this_risk_exp = risk_exp[index]
else:
......@@ -291,7 +311,7 @@ def fetch_train_phase(engine,
ret = dict()
ret['x_names'] = transformer.names
ret['train'] = {'x': ne_x, 'y': ne_y}
ret['train'] = {'x': ne_x, 'y': ne_y, 'code': this_code}
return ret
......@@ -342,7 +362,6 @@ def fetch_predict_phase(engine,
end = dates[-1]
start = dates[-batch]
# index = (date_label >= start) & (date_label <= end)
left_index = bisect.bisect_left(date_label, start)
right_index = bisect.bisect_right(date_label, end)
this_raw_x = x_values[left_index:right_index]
......@@ -380,26 +399,12 @@ def fetch_predict_phase(engine,
if __name__ == '__main__':
engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
universe = Universe('zz500', ['ashare_ex'])
universe = Universe('zz500', ['hs300', 'zz500'])
neutralized_risk = ['SIZE']
res = fetch_train_phase(engine,
['EPS', 'CFinc1'],
'2017-09-04',
'2w',
res = fetch_predict_phase(engine, ['ep_q'],
'2018-01-08',
'5b',
universe,
4,
warm_start=1,
16,
neutralized_risk=neutralized_risk)
print(res)
res = fetch_predict_phase(engine,
['EPS', 'CFinc1'],
'2017-09-04',
'2w',
universe,
4,
warm_start=1,
neutralized_risk=neutralized_risk)
print(res)
......@@ -14,6 +14,7 @@ from alphamind.model.treemodel import RandomForestRegressor
from alphamind.model.treemodel import RandomForestClassifier
from alphamind.model.treemodel import XGBRegressor
from alphamind.model.treemodel import XGBClassifier
from alphamind.model.treemodel import XGBTrainer
def load_model(model_desc: dict) -> ModelBase:
......@@ -37,5 +38,7 @@ def load_model(model_desc: dict) -> ModelBase:
return XGBRegressor.load(model_desc)
elif 'XGBClassifier' in model_name_parts:
return XGBClassifier.load(model_desc)
elif 'XGBTrainer' in model_name_parts:
return XGBTrainer.load(model_desc)
else:
raise ValueError('{0} is not currently supported in model loader.'.format(model_name))
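The new branch closes the save/load round trip for XGBTrainer, mirroring the persistence tests later in this diff (random toy data):

import numpy as np
from alphamind.model.treemodel import XGBTrainer
from alphamind.model.loader import load_model

x = np.random.randn(1000, 10)
y = np.where(np.random.randn(1000) > 0, 1, 0)

model = XGBTrainer(features=list(range(10)), objective='binary:logistic')
model.fit(x, y)

desc = model.save()           # a plain dict, carrying the xgboost version
new_model = load_model(desc)  # dispatches to the new XGBTrainer branch
np.testing.assert_array_almost_equal(model.predict(x), new_model.predict(x))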
......@@ -21,7 +21,7 @@ class ModelBase(metaclass=abc.ABCMeta):
self.impl = None
self.trained_time = None
def fit(self, x, y):
def fit(self, x: np.ndarray, y: np.ndarray):
self.impl.fit(x, y.flatten())
self.trained_time = arrow.now().format("YYYY-MM-DD HH:mm:ss")
......
......@@ -7,9 +7,13 @@ Created on 2017-12-4
from typing import List
from distutils.version import LooseVersion
import arrow
import numpy as np
from sklearn import __version__ as sklearn_version
from sklearn.ensemble import RandomForestRegressor as RandomForestRegressorImpl
from sklearn.ensemble import RandomForestClassifier as RandomForestClassifierImpl
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import __version__ as xgboost_version
from xgboost import XGBRegressor as XGBRegressorImpl
from xgboost import XGBClassifier as XGBClassifierImpl
......@@ -155,4 +159,91 @@ class XGBClassifier(ModelBase):
return self.impl.feature_importances_.tolist()
class XGBTrainer(ModelBase):
def __init__(self,
objective='binary:logistic',
booster='gbtree',
tree_method='hist',
n_estimators: int=100,
learning_rate: float=0.1,
max_depth=3,
eval_sample=None,
early_stopping_rounds=None,
subsample=1.,
colsample_bytree=1.,
features: List = None,
random_state=0,
**kwargs):
super().__init__(features)
self.params = {
'silent': 1,
'objective': objective,
'max_depth': max_depth,
'eta': learning_rate,
'booster': booster,
'tree_method': tree_method,
'subsample': subsample,
'colsample_bytree': colsample_bytree,
'seed': random_state
}
self.eval_sample = eval_sample
self.num_boost_round = n_estimators
self.early_stopping_rounds = early_stopping_rounds
self.impl = None
self.kwargs = kwargs
def fit(self, x, y):
if self.eval_sample:
x_train, x_eval, y_train, y_eval = train_test_split(x,
y,
test_size=self.eval_sample,
random_state=42)
d_train = xgb.DMatrix(x_train, y_train)
d_eval = xgb.DMatrix(x_eval, y_eval)
self.impl = xgb.train(params=self.params,
dtrain=d_train,
num_boost_round=self.num_boost_round,
evals=[(d_eval, 'eval')],
early_stopping_rounds=self.early_stopping_rounds,
verbose_eval=False,
**self.kwargs)
else:
d_train = xgb.DMatrix(x, y)
self.impl = xgb.train(params=self.params,
dtrain=d_train,
num_boost_round=self.num_boost_round,
**self.kwargs)
self.trained_time = arrow.now().format("YYYY-MM-DD HH:mm:ss")
def predict(self, x: np.ndarray) -> np.ndarray:
d_predict = xgb.DMatrix(x)
return self.impl.predict(d_predict)
def save(self) -> dict:
model_desc = super().save()
model_desc['xgboost_version'] = xgboost_version
model_desc['importances'] = self.importances
return model_desc
@classmethod
def load(cls, model_desc: dict):
obj_layout = super().load(model_desc)
if LooseVersion(xgboost_version) < LooseVersion(model_desc['xgboost_version']):
alpha_logger.warning('Current xgboost version {0} is lower than the model version {1}. '
'The loaded model may not work correctly.'.format(xgboost_version,
model_desc['xgboost_version']))
return obj_layout
@property
def importances(self):
imps = self.impl.get_fscore().items()
# get_fscore keys look like 'f0', 'f1', ...; sort by feature index, not lexicographically
imps = sorted(imps, key=lambda x: int(x[0][1:]))
return list(zip(*imps))[1]
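A usage sketch of the evaluation split on toy data: eval_sample holds out part of the rows as an xgboost watch list, which is what lets early stopping end boosting before n_estimators rounds (assumes the early_stopping_rounds forwarding added in fit above):

import numpy as np
from alphamind.model.treemodel import XGBTrainer

x = np.random.randn(500, 10)
y = np.where(np.random.randn(500) > 0, 1, 0)

model = XGBTrainer(features=list(range(10)),
                   objective='binary:logistic',
                   n_estimators=500,
                   eval_sample=0.2,           # 20% of rows go to the eval set
                   early_stopping_rounds=10)  # stop when eval loss stalls
model.fit(x, y)
probabilities = model.predict(x)  # binary:logistic yields probabilities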
......@@ -12,16 +12,18 @@ from alphamind.model.treemodel import RandomForestRegressor
from alphamind.model.treemodel import RandomForestClassifier
from alphamind.model.treemodel import XGBRegressor
from alphamind.model.treemodel import XGBClassifier
from alphamind.model.treemodel import XGBTrainer
class TestTreeModel(unittest.TestCase):
def setUp(self):
self.x = np.random.randn(1000, 10)
self.y = np.random.randn(1000)
def test_random_forest_regress_persistence(self):
model = RandomForestRegressor(features=list(range(10)))
x = np.random.randn(1000, 10)
y = np.random.randn(1000)
model.fit(x, y)
model.fit(self.x, self.y)
desc = model.save()
new_model = load_model(desc)
......@@ -29,14 +31,12 @@ class TestTreeModel(unittest.TestCase):
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_random_forest_classify_persistence(self):
model = RandomForestClassifier(features=list(range(10)))
x = np.random.randn(1000, 10)
y = np.random.randn(1000)
y = np.where(y > 0, 1, 0)
model.fit(x, y)
y = np.where(self.y > 0, 1, 0)
model.fit(self.x, y)
desc = model.save()
new_model = load_model(desc)
......@@ -44,13 +44,11 @@ class TestTreeModel(unittest.TestCase):
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_xgb_regress_persistence(self):
model = XGBRegressor(features=list(range(10)))
x = np.random.randn(1000, 10)
y = np.random.randn(1000)
model.fit(x, y)
model.fit(self.x, self.y)
desc = model.save()
new_model = load_model(desc)
......@@ -58,14 +56,56 @@ class TestTreeModel(unittest.TestCase):
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_xgb_classify_persistence(self):
model = XGBClassifier(features=list(range(10)))
x = np.random.randn(1000, 10)
y = np.random.randn(1000)
y = np.where(y > 0, 1, 0)
y = np.where(self.y > 0, 1, 0)
model.fit(x, y)
model.fit(self.x, y)
desc = model.save()
new_model = load_model(desc)
self.assertEqual(model.features, new_model.features)
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_xgb_trainer_equal_classifier(self):
sample_x = np.random.randn(100, 10)
model1 = XGBClassifier(n_estimators=100,
learning_rate=0.1,
max_depth=3,
features=list(range(10)),
random_state=42)
model2 = XGBTrainer(features=list(range(10)),
objective='reg:logistic',
booster='gbtree',
tree_method='exact',
n_estimators=100,
learning_rate=0.1,
max_depth=3,
random_state=42)
y = np.where(self.y > 0, 1, 0)
model1.fit(self.x, y)
model2.fit(self.x, y)
predict1 = model1.predict(sample_x)
predict2 = model2.predict(sample_x)
predict2 = np.where(predict2 > 0.5, 1., 0.)
np.testing.assert_array_almost_equal(predict1, predict2)
def test_xgb_trainer_persistence(self):
model = XGBTrainer(features=list(range(10)),
objective='binary:logistic',
booster='gbtree',
tree_method='hist',
n_estimators=200)
y = np.where(self.y > 0, 1, 0)
model.fit(self.x, y)
desc = model.save()
new_model = load_model(desc)
......@@ -73,3 +113,4 @@ class TestTreeModel(unittest.TestCase):
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
......@@ -4,7 +4,12 @@ cd xgboost
git submodule init
git submodule update
mkdir build
cd build
cmake ..
make -j4
cd ..
cd python-package
python setup.py install
......@@ -22,4 +27,4 @@ if [ $? -ne 0 ] ; then
exit 1
fi
cd ../..
\ No newline at end of file
cd ../..
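After python setup.py install, a one-line import check confirms which build ended up on the path (a sanity check, not part of the script):

# Post-install sanity check for the freshly built package.
import xgboost
print(xgboost.__version__)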
Subproject commit bf4367184164e593cd2856ef38f8dd4f8cc76999
Subproject commit a187ed6c8f3aa40b47d5be80667cbbe6a6fd563d