Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Sign in
Toggle navigation
A
alpha-mind
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Dr.李
alpha-mind
Commits
8c31e011
Commit
8c31e011
authored
Aug 24, 2017
by
Dr.李
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
simplified model training example
parent
a5260eba
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
241 additions
and
137 deletions
+241
-137
api.py
alphamind/api.py
+2
-2
sqlengine.py
alphamind/data/engines/sqlengine.py
+8
-5
model_training.py
alphamind/examples/model_training.py
+41
-59
data_preparing.py
alphamind/model/data_preparing.py
+190
-0
preparing.py
alphamind/model/preparing.py
+0
-71
No files found.
alphamind/api.py
View file @
8c31e011
...
...
@@ -22,7 +22,7 @@ from alphamind.data.standardize import standardize
from
alphamind.data.neutralize
import
neutralize
from
alphamind.data.engines.sqlengine
import
factor_tables
from
alphamind.model.
preparing
import
prepare_data
from
alphamind.model.
data_preparing
import
fetch_data_package
from
alphamind.utilities
import
alpha_logger
...
...
@@ -43,6 +43,6 @@ __all__ = [
'standardize'
,
'neutralize'
,
'factor_tables'
,
'
prepare_data
'
,
'
fetch_data_package
'
,
'alpha_logger'
]
\ No newline at end of file
alphamind/data/engines/sqlengine.py
View file @
8c31e011
...
...
@@ -224,7 +224,7 @@ class SqlEngine(object):
ref_date
:
str
,
factors
:
Iterable
[
object
],
codes
:
Iterable
[
int
],
default_window
:
int
=
0
)
->
pd
.
DataFrame
:
warm_start
:
int
=
0
)
->
pd
.
DataFrame
:
if
isinstance
(
factors
,
Transformer
):
transformer
=
factors
...
...
@@ -235,7 +235,7 @@ class SqlEngine(object):
factor_cols
=
_map_factors
(
dependency
)
start_date
=
advanceDateByCalendar
(
'china.sse'
,
ref_date
,
str
(
-
default_window
)
+
'b'
)
.
strftime
(
'
%
Y-
%
m-
%
d'
)
start_date
=
advanceDateByCalendar
(
'china.sse'
,
ref_date
,
str
(
-
warm_start
)
+
'b'
)
.
strftime
(
'
%
Y-
%
m-
%
d'
)
end_date
=
ref_date
big_table
=
Market
...
...
@@ -263,7 +263,7 @@ class SqlEngine(object):
start_date
:
str
=
None
,
end_date
:
str
=
None
,
dates
:
Iterable
[
str
]
=
None
,
default_window
:
int
=
0
)
->
pd
.
DataFrame
:
warm_start
:
int
=
0
)
->
pd
.
DataFrame
:
if
isinstance
(
factors
,
Transformer
):
transformer
=
factors
...
...
@@ -287,10 +287,10 @@ class SqlEngine(object):
real_dates
=
dates
else
:
if
dates
:
real_start_date
=
advanceDateByCalendar
(
'china.sse'
,
dates
[
0
],
str
(
-
default_window
)
+
'b'
)
.
strftime
(
'
%
Y-
%
m-
%
d'
)
real_start_date
=
advanceDateByCalendar
(
'china.sse'
,
dates
[
0
],
str
(
-
warm_start
)
+
'b'
)
.
strftime
(
'
%
Y-
%
m-
%
d'
)
real_end_date
=
dates
[
-
1
]
else
:
real_start_date
=
advanceDateByCalendar
(
'china.sse'
,
start_date
,
str
(
-
default_window
)
+
'b'
)
.
strftime
(
'
%
Y-
%
m-
%
d'
)
real_start_date
=
advanceDateByCalendar
(
'china.sse'
,
start_date
,
str
(
-
warm_start
)
+
'b'
)
.
strftime
(
'
%
Y-
%
m-
%
d'
)
real_end_date
=
end_date
real_dates
=
None
...
...
@@ -394,6 +394,9 @@ class SqlEngine(object):
risk_cov
=
pd
.
read_sql
(
query
,
self
.
engine
)
.
sort_values
([
'Date'
,
'FactorID'
])
if
not
excluded
:
excluded
=
[]
risk_exposure_cols
=
[
RiskExposure
.
__table__
.
columns
[
f
]
for
f
in
total_risk_factors
if
f
not
in
set
(
excluded
)]
big_table
=
outerjoin
(
special_risk_table
,
RiskExposure
,
and_
(
special_risk_table
.
Date
==
RiskExposure
.
Date
,
...
...
alphamind/examples/model_training.py
View file @
8c31e011
...
...
@@ -10,7 +10,9 @@ import pandas as pd
import
copy
from
sklearn.linear_model
import
LinearRegression
from
alphamind.api
import
*
from
PyFin.api
import
*
from
matplotlib
import
pyplot
as
plt
plt
.
style
.
use
(
'ggplot'
)
'''
...
...
@@ -25,6 +27,7 @@ Settings:
end_date - 2017-08-01
re-balance - 1 week
training - every 4 week
'''
engine
=
SqlEngine
(
'postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha'
)
...
...
@@ -39,91 +42,70 @@ start_date = '2012-01-01'
end_date
=
'2017-08-01'
'''
fetch data from target data base
fetch data from target data base
and do the corresponding data processing
'''
train_y
,
train_x
=
prepare_data
(
engine
,
start_date
=
start_date
,
end_date
=
end_date
,
factors
=
alpha_factors
+
neutralize_risk
,
frequency
=
frequency
,
universe
=
universe
,
benchmark
=
benchmark
)
dates
=
train_x
.
Date
.
unique
()
groups
=
train_x
.
Date
.
values
raw_x
=
train_x
[
alpha_factors
]
.
values
.
astype
(
float
)
raw_y
=
train_y
[[
'dx'
]]
.
values
.
astype
(
float
)
benchmark_w
=
train_x
[
'weight'
]
.
values
risk_exp
=
train_x
[
neutralize_risk
]
.
values
.
astype
(
float
)
data_package
=
fetch_data_package
(
engine
,
alpha_factors
=
alpha_factors
,
start_date
=
start_date
,
end_date
=
end_date
,
frequency
=
frequency
,
universe
=
universe
,
benchmark
=
benchmark
,
batch
=
batch
,
neutralized_risk
=
neutralize_risk
,
pre_process
=
[
winsorize_normal
,
standardize
],
post_process
=
[
standardize
])
'''
pre-processing stage for winsorize, standardize and neutralize
training phase: using Linear - regression from scikit-learn
'''
ne_x
=
raw_x
.
copy
()
ne_y
=
raw_y
.
copy
()
for
i
,
start_date
in
enumerate
(
dates
[:
-
batch
]):
end_date
=
dates
[
i
+
batch
]
index
=
(
groups
>=
start_date
)
&
(
groups
<
end_date
)
this_raw_x
=
raw_x
[
index
]
this_raw_y
=
raw_y
[
index
]
this_risk_exp
=
risk_exp
[
index
]
ne_x
[
index
]
=
factor_processing
(
this_raw_x
,
pre_process
=
[
winsorize_normal
,
standardize
],
risk_factors
=
this_risk_exp
,
post_process
=
[
standardize
])
ne_y
[
index
]
=
factor_processing
(
this_raw_y
,
pre_process
=
[
winsorize_normal
,
standardize
],
risk_factors
=
this_risk_exp
,
post_process
=
[
standardize
])
train_x
=
data_package
[
'train'
][
'x'
]
train_y
=
data_package
[
'train'
][
'y'
]
'''
training phase: using Linear - regression from scikit-learn
'''
dates
=
sorted
(
train_x
.
keys
())
model
=
LinearRegression
(
fit_intercept
=
False
)
model_df
=
pd
.
Series
()
for
i
,
start_date
in
enumerate
(
dates
[:
-
batch
]):
end_date
=
dates
[
i
+
batch
]
index
=
(
groups
>=
start_date
)
&
(
groups
<
end_date
)
this_ne_x
=
ne_x
[
index
]
this_ne_y
=
ne_y
[
index
]
model
.
fit
(
this_ne_x
,
this_ne_y
)
model_df
.
loc
[
end_date
]
=
copy
.
deepcopy
(
model
)
print
(
'Date: {0} training finished'
.
format
(
end_date
))
for
train_date
in
dates
:
x
=
train_x
[
train_date
]
y
=
train_y
[
train_date
]
model
.
fit
(
x
,
y
)
model_df
.
loc
[
train_date
]
=
copy
.
deepcopy
(
model
)
print
(
'Date: {0} training finished'
.
format
(
train_date
))
'''
predicting phase: using trained model on the re-balance dates
'''
final_res
=
np
.
zeros
((
len
(
dates
)
-
batch
,
n_bins
))
predict_x
=
data_package
[
'predict'
][
'x'
]
settlement
=
data_package
[
'settlement'
]
for
i
,
predict_date
in
enumerate
(
dates
[
batch
:]):
final_res
=
np
.
zeros
((
len
(
dates
),
n_bins
))
for
i
,
predict_date
in
enumerate
(
dates
):
model
=
model_df
[
predict_date
]
index
=
groups
==
predict_date
this_ne_x
=
ne_x
[
index
]
realized_r
=
raw_y
[
index
]
this_benchmark_w
=
benchmark_w
[
index
]
x
=
predict_x
[
predict_date
]
benchmark_w
=
settlement
[
settlement
.
Date
==
predict_date
][
'weight'
]
.
values
realized_r
=
settlement
[
settlement
.
Date
==
predict_date
][
'dx'
]
.
values
predict_y
=
model
.
predict
(
this_ne_
x
)
predict_y
=
model
.
predict
(
x
)
res
=
er_quantile_analysis
(
predict_y
,
n_bins
,
dx_return
=
realized_r
,
benchmark
=
this_benchmark_w
)
benchmark
=
benchmark_w
)
final_res
[
i
]
=
res
/
benchmark_w
.
sum
()
print
(
'Date: {0} predicting finished'
.
format
(
train_date
))
final_res
[
i
]
=
res
/
this_benchmark_w
.
sum
(
)
last_date
=
advanceDateByCalendar
(
'china.sse'
,
dates
[
-
1
],
frequency
)
df
=
pd
.
DataFrame
(
final_res
,
index
=
dates
[
batch
:])
df
.
loc
[
dates
[
0
]]
=
0.
df
=
pd
.
DataFrame
(
final_res
,
index
=
dates
[
1
:]
+
[
last_date
])
df
.
sort_index
(
inplace
=
True
)
df
=
df
.
cumsum
()
.
plot
()
plt
.
title
(
'Prod factors model training with Linear Regression from 2012 - 2017'
)
...
...
alphamind/model/data_preparing.py
0 → 100644
View file @
8c31e011
# -*- coding: utf-8 -*-
"""
Created on 2017-8-24
@author: cheng.li
"""
import
numpy
as
np
import
pandas
as
pd
from
typing
import
Iterable
from
typing
import
Union
from
PyFin.api
import
makeSchedule
from
PyFin.api
import
BizDayConventions
from
alphamind.data.transformer
import
Transformer
from
alphamind.data.engines.sqlengine
import
SqlEngine
from
alphamind.data.engines.universe
import
Universe
from
alphamind.data.processing
import
factor_processing
def
_map_horizon
(
frequency
:
str
)
->
int
:
if
frequency
==
'1d'
:
return
0
elif
frequency
==
'1w'
:
return
4
elif
frequency
==
'1m'
:
return
21
elif
frequency
==
'3m'
:
return
62
else
:
raise
ValueError
(
'{0} is an unrecognized frequency rule'
.
format
(
frequency
))
def prepare_data(engine: SqlEngine,
                 factors: Union[Transformer, Iterable[object]],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 warm_start: int = 0):
    """Fetch factor values, forward returns and benchmark weights for a universe.

    Builds a re-balance schedule between *start_date* and *end_date* on the
    ``china.sse`` calendar, pulls the factor / forward-return / benchmark
    tables for those dates, and inner-joins factors with returns (dropping
    rows with missing values) before left-joining the benchmark weights.
    Codes absent from the benchmark get weight 0.

    Returns
    -------
    tuple of pd.DataFrame
        ``(returns, features)`` where *returns* has columns
        ``['Date', 'Code', 'dx']`` and *features* has
        ``['Date', 'Code', 'weight']`` plus one column per factor.
    """
    dates = makeSchedule(start_date,
                         end_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following)
    horizon = _map_horizon(frequency)

    # Accept either an already-built Transformer or a raw factor collection.
    transformer = factors if isinstance(factors, Transformer) else Transformer(factors)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates,
                                          warm_start=warm_start).sort_values(['Date', 'Code'])
    return_df = engine.fetch_dx_return_range(universe,
                                             dates=dates,
                                             horizon=horizon)
    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)

    # Inner join on (Date, Code) keeps only rows with both factors and returns.
    merged = pd.merge(factor_df, return_df, on=['Date', 'Code']).dropna()
    # Left join: keep every remaining code even if it is not in the benchmark …
    merged = pd.merge(merged, benchmark_df, on=['Date', 'Code'], how='left')
    # … and give non-benchmark codes an explicit zero weight.
    merged['weight'] = merged['weight'].fillna(0.)

    target_cols = ['Date', 'Code', 'dx']
    feature_cols = ['Date', 'Code', 'weight'] + transformer.names
    return merged[target_cols], merged[feature_cols]
def batch_processing(x_values,
                     y_values,
                     groups,
                     group_label,
                     batch,
                     risk_exp,
                     pre_process,
                     post_process):
    """Bucket raw samples into per-date training and prediction sets.

    For each window of *batch* consecutive group keys, the samples whose
    label falls inside the window are processed (winsorize / standardize /
    neutralize via ``factor_processing``) and stored under the window's end
    key.  The prediction bucket is processed over a window shifted one step
    forward and then restricted to the end date itself, so the prediction
    features are normalized together with their own training window.

    Returns
    -------
    tuple of dict
        ``(train_x_buckets, train_y_buckets, predict_x_buckets)`` keyed by
        window end label.
    """
    train_x_buckets = {}
    train_y_buckets = {}
    predict_x_buckets = {}

    for offset, lower in enumerate(groups[:-batch]):
        upper = groups[offset + batch]

        # Training window: [lower, upper)
        train_mask = (group_label >= lower) & (group_label < upper)
        raw_x_chunk = x_values[train_mask]
        raw_y_chunk = y_values[train_mask]
        risk_chunk = risk_exp[train_mask] if risk_exp is not None else None

        train_x_buckets[upper] = factor_processing(raw_x_chunk,
                                                   pre_process=pre_process,
                                                   risk_factors=risk_chunk,
                                                   post_process=post_process)
        train_y_buckets[upper] = factor_processing(raw_y_chunk,
                                                   pre_process=pre_process,
                                                   risk_factors=risk_chunk,
                                                   post_process=post_process)

        # Prediction window: (lower, upper] — shifted by one step, then the
        # processed result is sliced down to the end label only.
        predict_mask = (group_label > lower) & (group_label <= upper)
        window_labels = group_label[predict_mask]
        raw_x_chunk = x_values[predict_mask]
        risk_chunk = risk_exp[predict_mask] if risk_exp is not None else None

        processed_x = factor_processing(raw_x_chunk,
                                        pre_process=pre_process,
                                        risk_factors=risk_chunk,
                                        post_process=post_process)
        predict_x_buckets[upper] = processed_x[window_labels == upper]

    return train_x_buckets, train_y_buckets, predict_x_buckets
def fetch_data_package(engine: SqlEngine,
                       alpha_factors: Iterable[object],
                       start_date: str,
                       end_date: str,
                       frequency: str,
                       universe: Universe,
                       benchmark: int,
                       warm_start: int = 0,
                       batch: int = 1,
                       neutralized_risk: Iterable[str] = None,
                       risk_model: str = 'short',
                       pre_process: Iterable[object] = None,
                       post_process: Iterable[object] = None):
    """Fetch and pre-process everything needed for a model-training run.

    Combines :func:`prepare_data` (factors, forward returns, benchmark
    weights) with optional risk-factor neutralization, then buckets the
    samples into per-date train / predict sets via :func:`batch_processing`.

    Parameters
    ----------
    engine
        Data source used for all range queries.
    alpha_factors
        Raw factor expressions; wrapped in a ``Transformer``.
    neutralized_risk
        Risk-factor column names to neutralize against; when falsy, no risk
        data is fetched and no neutralization risk exposure is passed on.
    risk_model
        Which risk model table to query (default ``'short'``).
    pre_process / post_process
        Processing pipelines forwarded to ``factor_processing``.

    Returns
    -------
    dict
        ``{'settlement': DataFrame with Date/Code/dx/weight,
           'train': {'x': ..., 'y': ...},
           'predict': {'x': ...}}`` where the inner dicts are keyed by date.
    """
    transformer = Transformer(alpha_factors)

    dates = makeSchedule(start_date,
                         end_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following)

    return_df, factor_df = prepare_data(engine,
                                        transformer,
                                        start_date,
                                        end_date,
                                        frequency,
                                        universe,
                                        benchmark,
                                        warm_start)

    if neutralized_risk:
        # [1] selects the exposure table from fetch_risk_model_range's result.
        risk_df = engine.fetch_risk_model_range(universe,
                                                dates=dates,
                                                risk_model=risk_model)[1]
        risk_df = risk_df[['Date', 'Code'] + neutralized_risk].dropna()
        # Inner joins restrict both features and returns to rows that have
        # complete risk exposures.
        train_x = pd.merge(factor_df, risk_df, on=['Date', 'Code'])
        return_df = pd.merge(return_df, risk_df, on=['Date', 'Code'])[['Date', 'Code', 'dx']]
        train_y = return_df.copy()
        risk_exp = train_x[neutralized_risk].values.astype(float)
        x_values = train_x[transformer.names].values.astype(float)
        y_values = train_y[['dx']].values
    else:
        risk_exp = None
        train_x = factor_df.copy()
        train_y = return_df.copy()
        x_values = train_x[transformer.names].values.astype(float)
        y_values = train_y[['dx']].values

    # Per-row date labels and the de-duplicated, sorted date axis used as
    # bucketing groups below.
    date_label = pd.DatetimeIndex(factor_df.Date).to_pydatetime()
    dates = np.unique(date_label)

    # NOTE(review): relies on return_df and train_x sharing a row index after
    # the merges above so the weight column aligns — confirm if either frame
    # is ever re-indexed.
    return_df['weight'] = train_x['weight']

    train_x_buckets, train_y_buckets, predict_x_buckets = batch_processing(x_values,
                                                                           y_values,
                                                                           dates,
                                                                           date_label,
                                                                           batch,
                                                                           risk_exp,
                                                                           pre_process,
                                                                           post_process)

    ret = dict()
    ret['settlement'] = return_df
    ret['train'] = {'x': train_x_buckets, 'y': train_y_buckets}
    ret['predict'] = {'x': predict_x_buckets}
    return ret
if __name__ == '__main__':
    # Smoke test: fetch a small data package against the development database.
    from PyFin.api import MA

    # NOTE(review): database credentials are embedded in source — consider
    # moving the connection string to configuration / environment.
    engine = SqlEngine('postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha')
    universe = Universe('zz500', ['zz500'])
    # MA(10, 'EPS') as the single alpha factor; benchmark index code 905.
    res = fetch_data_package(engine,
                             MA(10, 'EPS'),
                             '2012-01-01',
                             '2012-04-01',
                             '1m',
                             universe,
                             905,
                             0)
    print(res)
alphamind/model/preparing.py
deleted
100644 → 0
View file @
a5260eba
# -*- coding: utf-8 -*-
"""
Created on 2017-8-24
@author: cheng.li
"""
import
pandas
as
pd
from
typing
import
Iterable
from
typing
import
Union
from
PyFin.api
import
makeSchedule
from
PyFin.api
import
BizDayConventions
from
alphamind.data.transformer
import
Transformer
from
alphamind.data.engines.sqlengine
import
SqlEngine
from
alphamind.data.engines.universe
import
Universe
def
_map_horizon
(
frequency
:
str
)
->
int
:
if
frequency
==
'1d'
:
return
0
elif
frequency
==
'1w'
:
return
4
elif
frequency
==
'1m'
:
return
21
elif
frequency
==
'3m'
:
return
62
else
:
raise
ValueError
(
'{0} is an unrecognized frequency rule'
.
format
(
frequency
))
def
prepare_data
(
engine
:
SqlEngine
,
factors
:
Iterable
[
object
],
start_date
:
str
,
end_date
:
str
,
frequency
:
str
,
universe
:
Universe
,
benchmark
:
int
,
default_window
:
int
=
0
):
dates
=
makeSchedule
(
start_date
,
end_date
,
frequency
,
calendar
=
'china.sse'
,
dateRule
=
BizDayConventions
.
Following
)
horizon
=
_map_horizon
(
frequency
)
transformer
=
Transformer
(
factors
)
factor_df
=
engine
.
fetch_factor_range
(
universe
,
factors
=
transformer
,
dates
=
dates
,
default_window
=
default_window
)
.
sort_values
([
'Date'
,
'Code'
])
return_df
=
engine
.
fetch_dx_return_range
(
universe
,
dates
=
dates
,
horizon
=
horizon
)
benchmark_df
=
engine
.
fetch_benchmark_range
(
benchmark
,
dates
=
dates
)
df
=
pd
.
merge
(
factor_df
,
return_df
,
on
=
[
'Date'
,
'Code'
])
.
dropna
()
df
=
pd
.
merge
(
df
,
benchmark_df
,
on
=
[
'Date'
,
'Code'
],
how
=
'left'
)
df
[
'weight'
]
=
df
[
'weight'
]
.
fillna
(
0.
)
return
df
[[
'Date'
,
'Code'
,
'dx'
]],
df
[[
'Date'
,
'Code'
,
'weight'
]
+
transformer
.
names
]
if
__name__
==
'__main__'
:
from
PyFin.api
import
*
engine
=
SqlEngine
(
'postgresql+psycopg2://postgres:A12345678!@10.63.6.220/alpha'
)
universe
=
Universe
(
'zz500'
,
[
'zz500'
])
df1
,
df2
=
prepare_data
(
engine
,
MA
(
10
,
'EPS'
),
'2012-01-01'
,
'2013-01-01'
,
'1w'
,
universe
)
print
(
df1
)
print
(
df2
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment