Commit 4b49b7bb authored by Yucheng

Merge branch 'master' of https://github.com/lion-sing/alpha-mind

parents 6c4fb657 2c6b4668
......@@ -33,6 +33,8 @@ install:
- conda install pandas
- conda install scikit-learn
- conda install cython
- conda install sqlalchemy
- conda install psycopg2
- conda install -c conda-forge arrow
- pip install simpleutils
- pip install coveralls
......
......@@ -48,8 +48,8 @@ alpha-mind provides the tool chain commonly used in multi-factor research, including:
* Linux
   On Linux, a C++ compiler (e.g. g++) and a Fortran compiler (e.g. gfortran) are required:
On Linux, a C++ compiler (e.g. g++) and a Fortran compiler (e.g. gfortran) are required:
```bash
build_linux_dependencies.sh
```
......
......@@ -43,6 +43,10 @@ from alphamind.model import load_model
from alphamind.model.data_preparing import fetch_data_package
from alphamind.model.data_preparing import fetch_train_phase
from alphamind.model.data_preparing import fetch_predict_phase
from alphamind.model.composer import Composer
from alphamind.model.composer import DataMeta
from alphamind.model.composer import train_model
from alphamind.model.composer import predict_by_model
from alphamind.execution.naiveexecutor import NaiveExecutor
from alphamind.execution.thresholdexecutor import ThresholdExecutor
......@@ -79,6 +83,10 @@ __all__ = [
'fetch_data_package',
'fetch_train_phase',
'fetch_predict_phase',
'Composer',
'DataMeta',
'train_model',
'predict_by_model',
'LinearRegression',
'LassoRegression',
'ConstLinearModel',
......
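For reference, a minimal sketch of how these newly exported composer objects fit together, using only the constructor and save/load signatures exercised by the unit tests added in this commit (the connection string is a dummy):

```python
from alphamind.data.engines.universe import Universe
from alphamind.model.composer import Composer, DataMeta
from alphamind.model.treemodel import XGBClassifier

# DataMeta describes the data recipe the model was trained on
data_meta = DataMeta(freq='5b',
                     universe=Universe('custom', ['zz800']),
                     batch=4,
                     neutralized_risk=['SIZE'],
                     risk_model='long',
                     pre_process=['standardize', 'winsorize_normal'],
                     post_process=['standardize', 'winsorize_normal'],
                     warm_start=2,
                     data_source='postgresql://user:pwd@server/dummy')

alpha_model = XGBClassifier(features={'f1': 'closePrice', 'f2': 'openPrice'})
composer = Composer(alpha_model=alpha_model, data_meta=data_meta)

desc = composer.save()          # plain dict, storable in the models table
rebuilt = Composer.load(desc)   # round-trips model and data recipe together
```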
......@@ -5,8 +5,7 @@ Created on 2017-6-29
@author: cheng.li
"""
from sqlalchemy import BigInteger, Column, DateTime, Float, Index, Integer, String, Text, Boolean, text
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy import BigInteger, Column, DateTime, Float, Index, Integer, String, Text, Boolean, text, JSON
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
......@@ -635,7 +634,7 @@ class DailyPortfolios(Base):
industry = Column(String(50), nullable=False)
benchmark_weight = Column(Float(53), nullable=False)
is_tradable = Column(Boolean, nullable=False)
factor = Column(JSONB)
factor = Column(JSON)
class DailyPortfoliosSchedule(Base):
......@@ -857,7 +856,8 @@ class Models(Base):
model_type = Column(String(30), nullable=False)
model_version = Column(BigInteger, nullable=False)
update_time = Column(DateTime, nullable=False)
model_desc = Column(JSONB, nullable=False)
model_desc = Column(JSON, nullable=False)
data_meta = Column(JSON, nullable=True)
is_primary = Column(Boolean)
model_id = Column(Integer, primary_key=True, autoincrement=True)
......@@ -915,7 +915,7 @@ class Positions(Base):
trade_date = Column(DateTime, primary_key=True, nullable=False)
portfolio = Column(String(50), primary_key=True, nullable=False)
type = Column(String(50), primary_key=True, nullable=False)
weight = Column(JSONB)
weight = Column(JSON)
class QuantileAnalysis(Base):
......@@ -1865,7 +1865,7 @@ class Formulas(Base):
__tablename__ = 'formulas'
formula = Column(String(50), primary_key=True)
formula_desc = Column(JSONB, nullable=False)
formula_desc = Column(JSON, nullable=False)
comment = Column(Text)
......
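The changes above swap the PostgreSQL-only JSONB type for SQLAlchemy's dialect-neutral JSON, so the model declarations no longer bind the schema to one backend. A minimal sketch of the resulting pattern (the table below is hypothetical, not part of this diff):

```python
from sqlalchemy import Column, Integer, JSON
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class ModelRecord(Base):        # hypothetical table, mirroring the pattern above
    __tablename__ = 'model_record'
    model_id = Column(Integer, primary_key=True, autoincrement=True)
    model_desc = Column(JSON, nullable=False)   # portable across SQL dialects
```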
......@@ -155,6 +155,13 @@ class SqlEngine(object):
dates: Iterable[str] = None) -> pd.DataFrame:
return universe.query(self, start_date, end_date, dates)
def _create_stats(self, table, horizon, offset, code_attr='code'):
stats = func.sum(self.ln_func(1. + table.chgPct)).over(
partition_by=getattr(table, code_attr),
order_by=table.trade_date,
rows=(1 + DAILY_RETURN_OFFSET + offset, 1 + horizon + DAILY_RETURN_OFFSET + offset)).label('dx')
return stats
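`_create_stats` centralizes the window expression that the four dx-return queries below previously duplicated: a per-code, date-ordered sum of log(1 + chgPct) over a forward row frame. A standalone sketch of the expression it builds (the `table` objects and `func.ln` are stand-ins for the engine's own objects, with `DAILY_RETURN_OFFSET = 0` assumed):

```python
from sqlalchemy import func, select
from sqlalchemy.sql import table, column

market = table('market', column('trade_date'), column('code'), column('chgPct'))

horizon, offset, DAILY_RETURN_OFFSET = 5, 0, 0
dx = func.sum(func.ln(1. + market.c.chgPct)).over(
    partition_by=market.c.code,
    order_by=market.c.trade_date,
    # frame runs from the next row to `horizon` rows ahead, i.e. the
    # cumulative log return over the coming `horizon` trading days
    rows=(1 + DAILY_RETURN_OFFSET + offset,
          1 + horizon + DAILY_RETURN_OFFSET + offset)).label('dx')

print(select([market.c.trade_date, market.c.code, dx]))
```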
def fetch_dx_return(self,
ref_date: str,
codes: Iterable[int],
......@@ -169,10 +176,7 @@ class SqlEngine(object):
else:
end_date = expiry_date
stats = func.sum(self.ln_func(1. + Market.chgPct)).over(
partition_by=Market.code,
order_by=Market.trade_date,
rows=(1 + DAILY_RETURN_OFFSET + offset, 1 + horizon + DAILY_RETURN_OFFSET + offset)).label('dx')
stats = self._create_stats(Market, horizon, offset)
query = select([Market.trade_date, Market.code, stats]).where(
and_(
......@@ -200,24 +204,22 @@ class SqlEngine(object):
end_date = advanceDateByCalendar('china.sse', end_date,
str(1 + horizon + offset + DAILY_RETURN_OFFSET) + 'b').strftime('%Y-%m-%d')
stats = func.sum(self.ln_func(1. + Market.chgPct)).over(
partition_by=Market.code,
order_by=Market.trade_date,
rows=(1 + offset + DAILY_RETURN_OFFSET, 1 + horizon + offset + DAILY_RETURN_OFFSET)).label('dx')
stats = self._create_stats(Market, horizon, offset)
cond = universe._query_statements(start_date, end_date, None)
big_table = join(Market, UniverseTable,
t = select([Market.trade_date, Market.code, stats]).where(
Market.trade_date.between(start_date, end_date)
).alias('t')
big_table = join(t, UniverseTable,
and_(
Market.trade_date == UniverseTable.trade_date,
Market.code == UniverseTable.code,
t.columns['trade_date'] == UniverseTable.trade_date,
t.columns['code'] == UniverseTable.code,
cond
)
)
query = select([Market.trade_date, Market.code, stats]) \
.select_from(big_table)
query = select([t]).select_from(big_table)
df = pd.read_sql(query, self.session.bind).dropna()
if universe.is_filtered:
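The rewrite above computes the window sum in a subquery over the full market table and only then joins the universe; joining first would let membership filtering truncate the window frames and bias the forward returns. A minimal sketch of the compute-then-join pattern (table objects are hypothetical stand-ins):

```python
from sqlalchemy import and_, func, join, select
from sqlalchemy.sql import table, column

market = table('market', column('trade_date'), column('code'), column('chgPct'))
univ = table('universe', column('trade_date'), column('code'))

dx = func.sum(func.ln(1. + market.c.chgPct)).over(
    partition_by=market.c.code,
    order_by=market.c.trade_date,
    rows=(1, 6)).label('dx')

# 1) compute the window over the *full* market table ...
t = select([market.c.trade_date, market.c.code, dx]).alias('t')

# 2) ... then join the universe, so membership filtering cannot truncate
#    the row frames the forward-return sum depends on
big_table = join(t, univ, and_(t.c.trade_date == univ.c.trade_date,
                               t.c.code == univ.c.code))
print(select([t]).select_from(big_table))
```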
......@@ -242,10 +244,7 @@ class SqlEngine(object):
else:
end_date = expiry_date
stats = func.sum(self.ln_func(1. + IndexMarket.chgPct)).over(
partition_by=IndexMarket.indexCode,
order_by=IndexMarket.trade_date,
rows=(1 + DAILY_RETURN_OFFSET + offset, 1 + horizon + DAILY_RETURN_OFFSET + offset)).label('dx')
stats = self._create_stats(IndexMarket, horizon, offset, code_attr='indexCode')
query = select([IndexMarket.trade_date, IndexMarket.indexCode.label('code'), stats]).where(
and_(
......@@ -273,10 +272,7 @@ class SqlEngine(object):
end_date = advanceDateByCalendar('china.sse', end_date,
str(1 + horizon + offset + DAILY_RETURN_OFFSET) + 'b').strftime('%Y-%m-%d')
stats = func.sum(self.ln_func(1. + IndexMarket.chgPct)).over(
partition_by=IndexMarket.indexCode,
order_by=IndexMarket.trade_date,
rows=(1 + offset + DAILY_RETURN_OFFSET, 1 + horizon + offset + DAILY_RETURN_OFFSET)).label('dx')
stats = self._create_stats(IndexMarket, horizon, offset, code_attr='indexCode')
query = select([IndexMarket.trade_date, IndexMarket.indexCode.label('code'), stats]) \
.where(
......@@ -360,9 +356,11 @@ class SqlEngine(object):
factor_cols = _map_factors(dependency, factor_tables)
big_table = FullFactor
joined_tables = set()
joined_tables.add(FullFactor.__table__.name)
for t in set(factor_cols.values()):
if t.__table__.name != FullFactor.__table__.name:
if t.__table__.name not in joined_tables:
if dates is not None:
big_table = outerjoin(big_table, t, and_(FullFactor.trade_date == t.trade_date,
FullFactor.code == t.code,
......@@ -371,20 +369,18 @@ class SqlEngine(object):
big_table = outerjoin(big_table, t, and_(FullFactor.trade_date == t.trade_date,
FullFactor.code == t.code,
FullFactor.trade_date.between(start_date, end_date)))
joined_tables.add(t.__table__.name)
cond = universe._query_statements(start_date, end_date, dates)
big_table = join(big_table, UniverseTable,
and_(
FullFactor.trade_date == UniverseTable.trade_date,
FullFactor.code == UniverseTable.code,
cond
)
)
universe_df = universe.query(self, start_date, end_date, dates)
query = select(
[FullFactor.trade_date, FullFactor.code, FullFactor.isOpen] + list(factor_cols.keys())) \
.select_from(big_table).distinct()
.select_from(big_table).where(
and_(
FullFactor.code.in_(universe_df.code.unique().tolist()),
FullFactor.trade_date.in_(dates) if dates is not None else FullFactor.trade_date.between(start_date, end_date)
)
).distinct()
df = pd.read_sql(query, self.engine)
if universe.is_filtered:
......@@ -395,7 +391,6 @@ class SqlEngine(object):
df = pd.merge(df, external_data, on=['trade_date', 'code']).dropna()
df.sort_values(['trade_date', 'code'], inplace=True)
df.set_index('trade_date', inplace=True)
res = transformer.transform('code', df)
......@@ -404,11 +399,13 @@ class SqlEngine(object):
df[col] = res[col].values
df['isOpen'] = df.isOpen.astype(bool)
return df.reset_index()
df = df.reset_index()
return pd.merge(df, universe_df[['trade_date', 'code']], how='inner')
def fetch_benchmark(self,
ref_date: str,
benchmark: int) -> pd.DataFrame:
benchmark: int,
codes: Iterable[int]=None) -> pd.DataFrame:
query = select([IndexComponent.code, (IndexComponent.weight / 100.).label('weight')]).where(
and_(
IndexComponent.trade_date == ref_date,
......@@ -416,7 +413,13 @@ class SqlEngine(object):
)
)
return pd.read_sql(query, self.engine)
df = pd.read_sql(query, self.engine)
if codes:
df.set_index(['code'], inplace=True)
df = df.reindex(codes).fillna(0.)
df.reset_index(inplace=True)
return df
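With the new optional `codes` argument, benchmark weights are aligned to the caller's code list and missing constituents get zero weight. The same pandas pattern in isolation (data made up):

```python
import pandas as pd

df = pd.DataFrame({'code': [1, 3], 'weight': [0.6, 0.4]})
codes = [1, 2, 3, 4]          # the caller's universe, a superset of index members

df = df.set_index('code').reindex(codes).fillna(0.).reset_index()
print(df)
#    code  weight
# 0     1     0.6
# 1     2     0.0
# 2     3     0.4
# 3     4     0.0
```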
def fetch_benchmark_range(self,
benchmark: int,
......@@ -613,7 +616,7 @@ class SqlEngine(object):
res = df[['trade_date', 'code', 'industry_code', 'industry_name'] + in_s]
res = res.assign(**dict(zip(out_s, [0]*len(out_s))))
res = res.assign(**dict(zip(out_s, [0] * len(out_s))))
return res
def fetch_trade_status(self,
......@@ -747,6 +750,7 @@ class SqlEngine(object):
model_version=None,
is_primary=True,
model_id=None) -> pd.DataFrame:
from alphamind.model.composer import DataMeta
conditions = []
......@@ -768,8 +772,10 @@ class SqlEngine(object):
model_df = pd.read_sql(query, self.engine)
for i, model_desc in enumerate(model_df.model_desc):
for i, data in enumerate(zip(model_df.model_desc, model_df.data_meta)):
model_desc, data_desc = data
model_df.loc[i, 'model'] = load_model(model_desc)
model_df.loc[i, 'data_meta'] = DataMeta.load(data_desc)
del model_df['model_desc']
return model_df
......@@ -923,10 +929,11 @@ class SqlEngine(object):
if __name__ == '__main__':
universe = Universe('ss', ['hs300'])
from PyFin.api import *
engine = SqlEngine()
ref_date = '2017-12-28'
codes = universe.query(engine, dates=[ref_date])
df = engine.fetch_trade_status(ref_date, codes.code.tolist())
print(df)
\ No newline at end of file
ref_date = '2017-06-29'
universe = Universe('', ['zz800'])
dates = makeSchedule('2010-01-01', '2018-02-01', '10b', 'china.sse')
df = engine.fetch_factor_range(universe, DIFF('roe_q'), dates=dates)
......@@ -7,17 +7,19 @@ Created on 2017-7-7
from typing import Iterable
import pandas as pd
from simpleutils.miscellaneous import list_eq
from sqlalchemy import and_
from sqlalchemy import or_
from sqlalchemy import select
from sqlalchemy import join
from sqlalchemy import outerjoin
from PyFin.api import pyFinAssert
from alphamind.data.dbmodel.models import Universe as UniverseTable
from alphamind.data.dbmodel.models import FullFactor
from alphamind.data.engines.utilities import _map_factors
from alphamind.data.engines.utilities import factor_tables
from alphamind.data.transformer import Transformer
from alphamind.utilities import encode
from alphamind.utilities import decode
class Universe(object):
......@@ -25,15 +27,22 @@ class Universe(object):
def __init__(self,
name: str,
base_universe: Iterable,
exclude_universe: Iterable=None,
special_codes: Iterable=None,
exclude_universe: Iterable = None,
special_codes: Iterable = None,
filter_cond=None):
self.name = name
self.base_universe = base_universe
self.exclude_universe = exclude_universe
self.special_codes = special_codes
self.base_universe = sorted(base_universe) if base_universe else None
self.exclude_universe = sorted(exclude_universe) if exclude_universe else None
self.special_codes = sorted(special_codes) if special_codes else None
self.filter_cond = filter_cond
def __eq__(self, rhs):
return self.name == rhs.name \
and list_eq(self.base_universe, rhs.base_universe) \
and list_eq(self.exclude_universe, rhs.exclude_universe) \
and list_eq(self.special_codes, rhs.special_codes) \
and str(self.filter_cond) == str(rhs.filter_cond)
@property
def is_filtered(self):
return True if self.filter_cond is not None else False
......@@ -59,7 +68,7 @@ class Universe(object):
*and_conditions
)
def query(self, engine, start_date: str=None, end_date: str=None, dates=None) -> pd.DataFrame:
def query(self, engine, start_date: str = None, end_date: str = None, dates=None) -> pd.DataFrame:
universe_cond = self._query_statements(start_date, end_date, dates)
......@@ -103,6 +112,29 @@ class Universe(object):
df = df[df[filter_fields[0]] == 1].reset_index()[['trade_date', 'code']]
return df
def save(self):
return dict(
name=self.name,
base_universe=self.base_universe,
exclude_universe=self.exclude_universe,
special_codes=self.special_codes,
filter_cond=encode(self.filter_cond)
)
@classmethod
def load(cls, universe_desc: dict):
name = universe_desc['name']
base_universe = universe_desc['base_universe']
exclude_universe = universe_desc['exclude_universe']
special_codes = universe_desc['special_codes']
filter_cond = decode(universe_desc['filter_cond'])
return cls(name=name,
base_universe=base_universe,
exclude_universe=exclude_universe,
special_codes=special_codes,
filter_cond=filter_cond)
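`save` emits a plain dict whose filter condition is serialized via `encode`, so a universe can be persisted next to a model and rebuilt later. A round-trip sketch mirroring the new unit test:

```python
from PyFin.api import LAST
from alphamind.data.engines.universe import Universe

universe = Universe('custom', ['zz500'], filter_cond=LAST('x') > 1.)
desc = universe.save()              # plain dict; filter_cond serialized via encode()
loaded = Universe.load(desc)

assert loaded.name == universe.name
assert loaded.base_universe == universe.base_universe
assert str(loaded.filter_cond) == str(universe.filter_cond)
```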
if __name__ == '__main__':
from PyFin.api import *
......
......@@ -12,9 +12,6 @@ from PyFin.Analysis.SecurityValueHolders import SecurityValueHolder
from PyFin.api import transform as transform_impl
DEFAULT_FACTOR_NAME = 'user_factor'
def factor_translator(factor_pool):
if not factor_pool:
......@@ -23,7 +20,7 @@ def factor_translator(factor_pool):
if isinstance(factor_pool, str):
return {factor_pool: factor_pool}, [factor_pool]
elif isinstance(factor_pool, SecurityValueHolder):
return {DEFAULT_FACTOR_NAME: factor_pool}, sorted(factor_pool.fields)
return {str(factor_pool): factor_pool}, sorted(factor_pool.fields)
elif isinstance(factor_pool, dict):
dependency = set()
for k, v in factor_pool.items():
......@@ -46,7 +43,7 @@ def factor_translator(factor_pool):
factor_dict[f] = f
dependency = dependency.union([f])
elif isinstance(f, SecurityValueHolder):
factor_dict[DEFAULT_FACTOR_NAME + '_' + str(k).zfill(3)] = f
factor_dict[str(f)] = f
dependency = dependency.union(f.fields)
k += 1
return factor_dict, sorted(dependency)
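Expressions are now keyed by their own string representation rather than a shared 'user_factor' name, so two different expressions can no longer collide. A short sketch of the new naming (assumes PyFin is installed):

```python
from PyFin.api import LAST
from alphamind.data.transformer import factor_translator

factor_dict, dependency = factor_translator(LAST('roe_q'))
# Previously: {'user_factor': <expression>}; now the key is str(expression),
# so distinct expressions get distinct, self-describing column names.
print(list(factor_dict.keys()), dependency)
```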
......@@ -80,3 +77,8 @@ class Transformer(object):
return transformed_data
else:
return pd.DataFrame()
if __name__ == '__main__':
transformer = Transformer(['c', 'a'])
......@@ -21,7 +21,7 @@ Back test parameter settings
"""
start_date = '2010-01-01'
end_date = '2018-01-26'
end_date = '2018-01-29'
frequency = '10b'
method = 'risk_neutral'
......@@ -216,7 +216,7 @@ def factor_analysis(engine, factor_name, universe, benchmark_code, positive=True
def worker_func_positive(factor_name):
from alphamind.api import SqlEngine, Universe
neutralize_factors = None #['roe_q', 'ep_q']
neutralize_factors = ['roe_q', 'ep_q']
engine = SqlEngine()
benchmark_code = 905
universe_name = ['zz500']
......@@ -226,7 +226,7 @@ def worker_func_positive(factor_name):
def worker_func_negative(factor_name):
from alphamind.api import SqlEngine, Universe
neutralize_factors = None #['roe_q', 'ep_q']
neutralize_factors = ['roe_q', 'ep_q']
engine = SqlEngine()
benchmark_code = 905
universe_name = ['zz500']
......@@ -235,34 +235,34 @@ def worker_func_negative(factor_name):
if __name__ == '__main__':
# from dask.distributed import Client
#
# client = Client('10.63.6.176:8786')
#
# engine = SqlEngine()
# df = engine.fetch_factor_coverage()
# df = df[df.universe == 'zz800'].groupby('factor').mean()
# df = df[df.coverage >= 0.98]
#
# tasks = client.map(worker_func_positive, df.index.tolist())
# res1 = client.gather(tasks)
#
# tasks = client.map(worker_func_negative, df.index.tolist())
# res2 = client.gather(tasks)
#
# factor_df = pd.DataFrame()
#
# for f_name, df in res1:
# factor_df[f_name] = df['returns']
#
# for f_name, df in res2:
# factor_df[f_name] = df['returns']
from dask.distributed import Client
factor_name = LAST('ep_q') # LAST('EBITDA') / LAST('ev')
f_name, ret_df = worker_func_positive(factor_name)
client = Client('192.168.0.102:8786')
ret_df[['returns', 'tc_cost']].cumsum().plot(figsize=(12, 6),
title='Fixed frequency rebalanced: {0} for {1} with benchmark {2}'.format(
frequency, factor_name, 905),
secondary_y='tc_cost')
plt.show()
engine = SqlEngine()
df = engine.fetch_factor_coverage()
df = df[df.universe == 'zz800'].groupby('factor').mean()
df = df[df.coverage >= 0.98]
tasks = client.map(worker_func_positive, df.index.tolist())
res1 = client.gather(tasks)
tasks = client.map(worker_func_negative, df.index.tolist())
res2 = client.gather(tasks)
factor_df = pd.DataFrame()
for f_name, df in res1:
factor_df[f_name] = df['returns']
for f_name, df in res2:
factor_df[f_name] = df['returns']
# factor_name = LAST('EBITDA') / LAST('ev')
# f_name, ret_df = worker_func_positive(factor_name)
#
# ret_df[['returns', 'tc_cost']].cumsum().plot(figsize=(12, 6),
# title='Fixed frequency rebalanced: {0} for {1} with benchmark {2}'.format(
# frequency, factor_name, 905),
# secondary_y='tc_cost')
# plt.show()
......@@ -15,6 +15,7 @@ from PyFin.api import makeSchedule
from PyFin.api import BizDayConventions
from PyFin.api import DateGeneration
from PyFin.api import advanceDateByCalendar
from PyFin.api import pyFinAssert
from PyFin.DateUtilities import Period
from alphamind.data.transformer import Transformer
from alphamind.data.engines.sqlengine import SqlEngine
......@@ -101,7 +102,8 @@ def prepare_data(engine: SqlEngine,
['trade_date', 'code', 'weight', 'isOpen', 'industry_code', 'industry'] + transformer.names]
def batch_processing(x_values,
def batch_processing(names,
x_values,
y_values,
groups,
group_label,
......@@ -132,10 +134,11 @@ def batch_processing(x_values,
else:
this_risk_exp = None
train_x_buckets[end] = factor_processing(this_raw_x,
pre_process=pre_process,
risk_factors=this_risk_exp,
post_process=post_process)
train_x_buckets[end] = pd.DataFrame(factor_processing(this_raw_x,
pre_process=pre_process,
risk_factors=this_risk_exp,
post_process=post_process),
columns=names)
train_y_buckets[end] = factor_processing(this_raw_y,
pre_process=pre_process,
......@@ -163,7 +166,7 @@ def batch_processing(x_values,
inner_left_index = bisect.bisect_left(sub_dates, end)
inner_right_index = bisect.bisect_right(sub_dates, end)
predict_x_buckets[end] = ne_x[inner_left_index:inner_right_index]
predict_x_buckets[end] = pd.DataFrame(ne_x[inner_left_index:inner_right_index], columns=names)
predict_risk_buckets[end] = this_risk_exp[inner_left_index:inner_right_index]
predict_codes_bucket[end] = this_codes[inner_left_index:inner_right_index]
......@@ -198,8 +201,8 @@ def fetch_data_package(engine: SqlEngine,
pre_process: Iterable[object] = None,
post_process: Iterable[object] = None) -> dict:
alpha_logger.info("Starting data package fetching ...")
transformer = Transformer(alpha_factors)
names = transformer.names
dates, return_df, factor_df = prepare_data(engine,
transformer,
start_date,
......@@ -210,7 +213,7 @@ def fetch_data_package(engine: SqlEngine,
warm_start)
return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y, codes = \
_merge_df(engine, transformer.names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)
_merge_df(engine, names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)
alpha_logger.info("data merging finished")
......@@ -226,7 +229,8 @@ def fetch_data_package(engine: SqlEngine,
alpha_logger.info("Loading data is finished")
train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets, predict_codes_bucket \
= batch_processing(x_values,
= batch_processing(names,
x_values,
y_values,
dates,
date_label,
......@@ -239,15 +243,16 @@ def fetch_data_package(engine: SqlEngine,
alpha_logger.info("Data processing is finished")
ret = dict()
ret['x_names'] = transformer.names
ret['x_names'] = names
ret['settlement'] = return_df
ret['train'] = {'x': train_x_buckets, 'y': train_y_buckets, 'risk': train_risk_buckets}
ret['predict'] = {'x': predict_x_buckets, 'y': predict_y_buckets, 'risk': predict_risk_buckets, 'code': predict_codes_bucket}
ret['predict'] = {'x': predict_x_buckets, 'y': predict_y_buckets, 'risk': predict_risk_buckets,
'code': predict_codes_bucket}
return ret
def fetch_train_phase(engine,
alpha_factors: Iterable[object],
alpha_factors: Union[Transformer, Iterable[object]],
ref_date,
frequency,
universe,
......@@ -257,7 +262,10 @@ def fetch_train_phase(engine,
pre_process: Iterable[object] = None,
post_process: Iterable[object] = None,
warm_start: int = 0) -> dict:
transformer = Transformer(alpha_factors)
if isinstance(alpha_factors, Transformer):
transformer = alpha_factors
else:
transformer = Transformer(alpha_factors)
p = Period(frequency)
p = Period(length=-(warm_start + batch + 1) * p.length(), units=p.units())
......@@ -284,11 +292,12 @@ def fetch_train_phase(engine,
_merge_df(engine, transformer.names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)
if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
pyFinAssert(len(dates) >= 2, ValueError, "No previous data for training for the date {0}".format(ref_date))
end = dates[-2]
start = dates[-batch - 1]
start = dates[-batch - 1] if batch <= len(dates) - 1 else dates[0]
else:
end = dates[-1]
start = dates[-batch]
start = dates[-batch] if batch <= len(dates) else dates[0]
index = (date_label >= start) & (date_label <= end)
this_raw_x = x_values[index]
......@@ -311,13 +320,13 @@ def fetch_train_phase(engine,
ret = dict()
ret['x_names'] = transformer.names
ret['train'] = {'x': ne_x, 'y': ne_y, 'code': this_code}
ret['train'] = {'x': pd.DataFrame(ne_x, columns=transformer.names), 'y': ne_y, 'code': this_code}
return ret
def fetch_predict_phase(engine,
alpha_factors: Iterable[object],
alpha_factors: Union[Transformer, Iterable[object]],
ref_date,
frequency,
universe,
......@@ -326,8 +335,12 @@ def fetch_predict_phase(engine,
risk_model: str = 'short',
pre_process: Iterable[object] = None,
post_process: Iterable[object] = None,
warm_start: int = 0):
transformer = Transformer(alpha_factors)
warm_start: int = 0,
fillna: str=None):
if isinstance(alpha_factors, Transformer):
transformer = alpha_factors
else:
transformer = Transformer(alpha_factors)
p = Period(frequency)
p = Period(length=-(warm_start + batch) * p.length(), units=p.units())
......@@ -340,7 +353,12 @@ def fetch_predict_phase(engine,
dateRule=BizDayConventions.Following,
dateGenerationRule=DateGeneration.Backward)
factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates).dropna()
factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates)
if fillna:
factor_df = factor_df.groupby('trade_date').apply(lambda x: x.fillna(x.median())).reset_index(drop=True).dropna()
else:
factor_df = factor_df.dropna()
names = transformer.names
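The new `fillna` option imputes missing factor values with each trade date's cross-sectional median before dropping whatever is still missing. The same pandas idiom in isolation (`numeric_only` is spelled out here for newer pandas; the library code relies on the older default):

```python
import numpy as np
import pandas as pd

factor_df = pd.DataFrame({
    'trade_date': ['2018-01-02'] * 3 + ['2018-01-03'] * 3,
    'code': [1, 2, 3] * 2,
    'f1': [1.0, np.nan, 3.0, np.nan, 5.0, 7.0],
})

filled = (factor_df.groupby('trade_date')
                   .apply(lambda x: x.fillna(x.median(numeric_only=True)))
                   .reset_index(drop=True))
print(filled)   # NaNs become 2.0 and 6.0, each date's cross-sectional median
```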
......@@ -360,7 +378,7 @@ def fetch_predict_phase(engine,
if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
end = dates[-1]
start = dates[-batch]
start = dates[-batch] if batch <= len(dates) else dates[0]
left_index = bisect.bisect_left(date_label, start)
right_index = bisect.bisect_right(date_label, end)
......@@ -392,7 +410,7 @@ def fetch_predict_phase(engine,
ret = dict()
ret['x_names'] = transformer.names
ret['predict'] = {'x': ne_x, 'code': codes}
ret['predict'] = {'x': pd.DataFrame(ne_x, columns=transformer.names), 'code': codes}
return ret
......
......@@ -19,7 +19,7 @@ from alphamind.utilities import alpha_logger
class ConstLinearModelImpl(object):
def __init__(self, weights: np.ndarray = None):
self.weights = np.array(weights).flatten()
self.weights = weights.flatten()
def fit(self, x: np.ndarray, y: np.ndarray):
pass
......@@ -31,15 +31,15 @@ class ConstLinearModelImpl(object):
class ConstLinearModel(ModelBase):
def __init__(self,
features: list = None,
formulas: dict = None,
weights: np.ndarray = None):
super().__init__(features, formulas=formulas)
features=None,
weights: dict = None):
super().__init__(features)
if features is not None and weights is not None:
pyFinAssert(len(features) == len(weights),
ValueError,
"length of features is not equal to length of weights")
self.impl = ConstLinearModelImpl(weights)
if weights:
self.impl = ConstLinearModelImpl(np.array([weights[name] for name in self.features]))
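Weights are now passed as a dict keyed by feature name and reordered internally to match `self.features`, so argument order no longer matters. A usage sketch mirroring the updated unit test:

```python
import numpy as np
import pandas as pd
from alphamind.model.linearmodel import ConstLinearModel

model = ConstLinearModel(features=['c', 'b', 'a'],
                         weights=dict(a=1., b=2., c=3.))

x = pd.DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c'])
y = model.predict(x)   # predict() aligns columns to model.features internally,
                       # so the dict keys, not any ordering, bind weight to factor
```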
def save(self):
model_desc = super().save()
......@@ -57,10 +57,9 @@ class ConstLinearModel(ModelBase):
class LinearRegression(ModelBase):
def __init__(self, features: list = None, formulas: dict = None, fit_intercept: bool = False, **kwargs):
super().__init__(features, formulas=formulas)
def __init__(self, features=None, fit_intercept: bool = False, **kwargs):
super().__init__(features)
self.impl = LinearRegressionImpl(fit_intercept=fit_intercept, **kwargs)
self.trained_time = None
def save(self) -> dict:
model_desc = super().save()
......@@ -85,10 +84,9 @@ class LinearRegression(ModelBase):
class LassoRegression(ModelBase):
def __init__(self, alpha=0.01, features: list = None, formulas: dict = None, fit_intercept: bool = False, **kwargs):
super().__init__(features, formulas=formulas)
def __init__(self, alpha=0.01, features=None, fit_intercept: bool = False, **kwargs):
super().__init__(features)
self.impl = Lasso(alpha=alpha, fit_intercept=fit_intercept, **kwargs)
self.trained_time = None
def save(self) -> dict:
model_desc = super().save()
......@@ -113,8 +111,8 @@ class LassoRegression(ModelBase):
class LogisticRegression(ModelBase):
def __init__(self, features: list = None, formulas: dict = None, fit_intercept: bool = False, **kwargs):
super().__init__(features, formulas=formulas)
def __init__(self, features=None, fit_intercept: bool = False, **kwargs):
super().__init__(features)
self.impl = LogisticRegressionImpl(fit_intercept=fit_intercept, **kwargs)
def save(self) -> dict:
......
......@@ -6,34 +6,42 @@ Created on 2017-9-4
"""
import abc
import copy
import arrow
import numpy as np
import pandas as pd
from simpleutils.miscellaneous import list_eq
from alphamind.utilities import alpha_logger
from alphamind.utilities import encode
from alphamind.utilities import decode
from alphamind.data.transformer import Transformer
class ModelBase(metaclass=abc.ABCMeta):
def __init__(self, features: list=None, formulas: dict=None):
def __init__(self, features=None):
if features is not None:
self.features = list(features)
self.formulas = Transformer(features)
self.features = self.formulas.names
else:
self.features = None
self.impl = None
self.formulas = copy.deepcopy(formulas)
self.trained_time = None
def fit(self, x: np.ndarray, y: np.ndarray):
self.impl.fit(x, y.flatten())
def __eq__(self, rhs):
return encode(self.impl) == encode(rhs.impl) \
and self.trained_time == rhs.trained_time \
and list_eq(self.features, rhs.features) \
and encode(self.formulas) == encode(rhs.formulas)
def fit(self, x: pd.DataFrame, y: np.ndarray):
self.impl.fit(x[self.features].values, y.flatten())
self.trained_time = arrow.now().format("YYYY-MM-DD HH:mm:ss")
def predict(self, x: np.ndarray) -> np.ndarray:
return self.impl.predict(x)
def predict(self, x: pd.DataFrame) -> np.ndarray:
return self.impl.predict(x[self.features].values)
def score(self, x: np.ndarray, y: np.ndarray) -> float:
return self.impl.score(x, y)
def score(self, x: pd.DataFrame, y: np.ndarray) -> float:
return self.impl.score(x[self.features].values, y)
@abc.abstractmethod
def save(self) -> dict:
......
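`ModelBase` now accepts a pandas DataFrame and selects the `self.features` columns itself, so callers may pass a wider frame with columns in any order. A usage sketch (`LinearRegression` stands in for any subclass):

```python
import numpy as np
import pandas as pd
from alphamind.model.linearmodel import LinearRegression

x = pd.DataFrame(np.random.randn(100, 4), columns=['a', 'b', 'c', 'extra'])
y = np.random.randn(100)

model = LinearRegression(features=['a', 'b', 'c'])
model.fit(x, y)           # uses x[model.features].values under the hood
pred = model.predict(x)   # the surplus 'extra' column is simply ignored
```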
......@@ -5,10 +5,10 @@ Created on 2017-12-4
@author: cheng.li
"""
from typing import List
from distutils.version import LooseVersion
import arrow
import numpy as np
import pandas as pd
from sklearn import __version__ as sklearn_version
from sklearn.ensemble import RandomForestRegressor as RandomForestRegressorImpl
from sklearn.ensemble import RandomForestClassifier as RandomForestClassifierImpl
......@@ -26,18 +26,16 @@ class RandomForestRegressor(ModelBase):
def __init__(self,
n_estimators: int=100,
max_features: str='auto',
features: List=None,
features=None,
**kwargs):
super().__init__(features, **kwargs)
super().__init__(features)
self.impl = RandomForestRegressorImpl(n_estimators=n_estimators,
max_features=max_features,
**kwargs)
self.trained_time = None
def save(self) -> dict:
model_desc = super().save()
model_desc['sklearn_version'] = sklearn_version
model_desc['importances'] = self.importances
return model_desc
@classmethod
......@@ -60,19 +58,16 @@ class RandomForestClassifier(ModelBase):
def __init__(self,
n_estimators: int=100,
max_features: str='auto',
features: List = None,
formulas: dict = None,
features=None,
**kwargs):
super().__init__(features, formulas=formulas)
super().__init__(features)
self.impl = RandomForestClassifierImpl(n_estimators=n_estimators,
max_features=max_features,
**kwargs)
self.trained_time = None
def save(self) -> dict:
model_desc = super().save()
model_desc['sklearn_version'] = sklearn_version
model_desc['importances'] = self.importances
return model_desc
@classmethod
......@@ -96,11 +91,10 @@ class XGBRegressor(ModelBase):
n_estimators: int=100,
learning_rate: float=0.1,
max_depth: int=3,
features: List=None,
formulas: dict = None,
features=None,
n_jobs: int=1,
**kwargs):
super().__init__(features, formulas=formulas)
super().__init__(features)
self.impl = XGBRegressorImpl(n_estimators=n_estimators,
learning_rate=learning_rate,
max_depth=max_depth,
......@@ -110,7 +104,6 @@ class XGBRegressor(ModelBase):
def save(self) -> dict:
model_desc = super().save()
model_desc['xgbboot_version'] = xgbboot_version
model_desc['importances'] = self.importances
return model_desc
@classmethod
......@@ -134,11 +127,10 @@ class XGBClassifier(ModelBase):
n_estimators: int=100,
learning_rate: float=0.1,
max_depth: int=3,
features: List = None,
formulas: dict = None,
features=None,
n_jobs: int=1,
**kwargs):
super().__init__(features, formulas=formulas)
super().__init__(features)
self.impl = XGBClassifierImpl(n_estimators=n_estimators,
learning_rate=learning_rate,
max_depth=max_depth,
......@@ -148,7 +140,6 @@ class XGBClassifier(ModelBase):
def save(self) -> dict:
model_desc = super().save()
model_desc['xgbboot_version'] = xgbboot_version
model_desc['importances'] = self.importances
return model_desc
@classmethod
......@@ -179,12 +170,11 @@ class XGBTrainer(ModelBase):
early_stopping_rounds=None,
subsample=1.,
colsample_bytree=1.,
features: List = None,
formulas: dict = None,
features=None,
random_state: int=0,
n_jobs: int=1,
**kwargs):
super().__init__(features, formulas=formulas)
super().__init__(features)
self.params = {
'silent': 1,
'objective': objective,
......@@ -204,9 +194,9 @@ class XGBTrainer(ModelBase):
self.impl = None
self.kwargs = kwargs
def fit(self, x, y):
def fit(self, x: pd.DataFrame, y: np.ndarray):
if self.eval_sample:
x_train, x_eval, y_train, y_eval = train_test_split(x,
x_train, x_eval, y_train, y_eval = train_test_split(x[self.features].values,
y,
test_size=self.eval_sample,
random_state=42)
......@@ -219,7 +209,7 @@ class XGBTrainer(ModelBase):
verbose_eval=False,
**self.kwargs)
else:
d_train = xgb.DMatrix(x, y)
d_train = xgb.DMatrix(x[self.features].values, y)
self.impl = xgb.train(params=self.params,
dtrain=d_train,
num_boost_round=self.num_boost_round,
......@@ -227,14 +217,13 @@ class XGBTrainer(ModelBase):
self.trained_time = arrow.now().format("YYYY-MM-DD HH:mm:ss")
def predict(self, x: np.ndarray) -> np.ndarray:
d_predict = xgb.DMatrix(x)
def predict(self, x: pd.DataFrame) -> np.ndarray:
d_predict = xgb.DMatrix(x[self.features].values)
return self.impl.predict(d_predict)
def save(self) -> dict:
model_desc = super().save()
model_desc['xgbboot_version'] = xgbboot_version
model_desc['importances'] = self.importances
return model_desc
@classmethod
......
......@@ -16,9 +16,9 @@ class Allocation(object):
def __init__(self,
code: int,
minimum: int=0,
maximum: int=inf,
current: int=0):
minimum: int = 0,
maximum: int = inf,
current: int = 0):
self.code = code
self.minimum = minimum
self.maximum = maximum
......@@ -62,39 +62,32 @@ class Portfolio(object):
class Execution(object):
def __init__(self,
name: str,
code: int,
qty: int,
comment: str=None):
cpty: str = 'external',
comment: str = None):
self.name = name
self.code = code
self.qty = qty
self.cpty = cpty
self.comment = comment
def __repr__(self):
return "Execution(code={0}, qty={1}, comment={2})".format(self.code,
self.qty,
self.comment)
class Executions(object):
def __init__(self,
name,
executions: List[Execution]=None):
self.name = name
self.executions = executions
def __repr__(self):
return "Executions(name={0}, executions={1})".format(self.name,
self.executions)
return "Execution(name={0}, code={1}, qty={2}, cpty={3}, comment={4})".format(self.name,
self.code,
self.qty,
self.cpty,
self.comment)
class Asset(object):
def __init__(self,
code: int,
name: str=None,
priority: List[str]=None,
forbidden: List[str]=None):
name: str = None,
priority: List[str] = None,
forbidden: List[str] = None):
self.code = code
self.name = name
if priority:
......@@ -119,11 +112,11 @@ class Asset(object):
self.forbidden)
class TargetPositions(object):
class Positions(object):
def __init__(self,
assets: List[Asset]=None,
qtys: List[int]=None):
assets: List[Asset] = None,
qtys: List[int] = None):
if assets:
self.targets = {asset.code: (asset, qty) for asset, qty in zip(assets, qtys)}
......@@ -133,9 +126,9 @@ class TargetPositions(object):
def add_asset(self,
asset: Asset,
qty: int):
if asset.code in self.targets:
raise ValueError()
self.targets[asset.code] = (asset, qty)
code = asset.code
pyFinAssert(code not in self.targets, ValueError, "code {0} is already in positions".format(code))
self.targets[code] = (asset, qty)
def __getitem__(self, code: int) -> Tuple[Asset, int]:
return self.targets[code]
......@@ -148,10 +141,10 @@ class TargetPositions(object):
return "TargetPositions(assets={0}, qtys={1})".format(*zip(*self.targets.values()))
def handle_one_asset(pre_allocation: Allocation,
def handle_one_asset(p_name: str,
pre_allocation: Allocation,
asset: Asset,
qty: int) -> Tuple[Execution, Allocation, int]:
minimum = pre_allocation.minimum
maximum = pre_allocation.maximum
current = pre_allocation.current
......@@ -161,14 +154,20 @@ def handle_one_asset(pre_allocation: Allocation,
raise ValueError("{0}'s target {1} is smaller than minimum amount {2}".format(asset.code, qty, minimum))
elif qty < maximum:
# need to buy / sell
ex = Execution(code, qty - current)
ex = Execution(name=p_name,
code=code,
qty=qty - current,
cpty='external')
allocation = Allocation(code,
minimum=minimum,
maximum=maximum,
current=qty)
qty = 0
else:
ex = Execution(code, maximum - current)
ex = Execution(name=p_name,
code=code,
qty=maximum - current,
cpty='external')
allocation = Allocation(code,
minimum=minimum,
maximum=maximum,
......@@ -177,41 +176,40 @@ def handle_one_asset(pre_allocation: Allocation,
return ex, allocation, qty
def pass_through(target_pos: TargetPositions,
portfolio: Portfolio) -> Tuple[Executions, Portfolio, TargetPositions]:
def pass_through(target_pos: Positions,
portfolio: Portfolio) -> Tuple[List[Execution], Portfolio, Positions]:
p_name = portfolio.name
new_target_pos = TargetPositions()
new_target_pos = Positions()
allocations = []
executions = []
for code in target_pos.codes:
asset, qty = target_pos[code]
if asset.priority:
raise ValueError("asset ({0})'s priority pool {1} is not checked yet".format(code, asset.priority))
pyFinAssert(not asset.priority,
ValueError,
"asset ({0})'s priority pool {1} is not checked yet".format(code, asset.priority))
if p_name in asset.forbidden:
ex = Execution(name=p_name, code=code, qty=0, comment="{0} is forbidden for {1}".format(code, p_name))
allocation = copy.deepcopy(portfolio[code])
new_target_pos.add_asset(asset, qty)
else:
prev_allocation = portfolio[code]
ex, allocation, qty = handle_one_asset(prev_allocation, asset, qty)
ex, allocation, qty = handle_one_asset(p_name, prev_allocation, asset, qty)
new_target_pos.add_asset(asset, qty)
if ex.qty != 0:
executions.append(ex)
allocations.append(allocation)
executions.append(ex)
return Executions(p_name, executions), Portfolio(p_name, allocations), new_target_pos
return executions, Portfolio(p_name, allocations), new_target_pos
if __name__ == '__main__':
asset1 = Asset(1, 'a')
asset2 = Asset(2, 'b')
asset3 = Asset(3, 'b')
target_pos = TargetPositions([asset1, asset2, asset3], [200, 300, 100])
target_pos = Positions([asset1, asset2, asset3], [200, 300, 100])
allc1 = Allocation(1, 0, 100, 0)
allc2 = Allocation(2, 0, 400, 100)
......@@ -219,8 +217,3 @@ if __name__ == '__main__':
portfolio = Portfolio('test1', [allc1, allc2])
executions, portfolio, target_pos = pass_through(target_pos, portfolio)
# -*- coding: utf-8 -*-
"""
Created on 2018-2-9
@author: cheng.li
"""
import unittest
from PyFin.api import LAST
from alphamind.data.engines.universe import Universe
class TestUniverse(unittest.TestCase):
def test_universe_equal(self):
universe1 = Universe('custom', ['zz500'])
universe2 = Universe('custom', ['zz500'])
self.assertEqual(universe1, universe2)
universe1 = Universe('custom', ['zz500'])
universe2 = Universe('custom', ['zz800'])
self.assertNotEqual(universe1, universe2)
filter_cond = LAST('x') > 1.
universe1 = Universe('custom', ['zz500'], filter_cond=filter_cond)
universe2 = Universe('custom', ['zz500'], filter_cond=filter_cond)
self.assertEqual(universe1, universe2)
universe1 = Universe('custom', ['zz500'], filter_cond=LAST('x') > 1.)
universe2 = Universe('custom', ['zz500'], filter_cond=LAST('x') > 2.)
self.assertNotEqual(universe1, universe2)
def test_universe_persistence(self):
universe = Universe('custom', ['zz500'])
univ_desc = universe.save()
loaded_universe = Universe.load(univ_desc)
self.assertEqual(universe.name, loaded_universe.name)
self.assertListEqual(universe.base_universe, loaded_universe.base_universe)
universe = Universe('custom', ['zz500'], filter_cond=LAST('x') > 1.)
univ_desc = universe.save()
loaded_universe = Universe.load(univ_desc)
self.assertEqual(universe.name, loaded_universe.name)
self.assertListEqual(universe.base_universe, loaded_universe.base_universe)
self.assertEqual(str(universe.filter_cond), str(loaded_universe.filter_cond))
# -*- coding: utf-8 -*-
"""
Created on 2018-2-9
@author: cheng.li
"""
import unittest
from alphamind.data.engines.universe import Universe
from alphamind.model.composer import DataMeta
from alphamind.model.composer import Composer
from alphamind.model.treemodel import XGBClassifier
class TestComposer(unittest.TestCase):
def _assert_composer_equal(self, lhs: Composer, rhs: Composer):
self.assertEqual(lhs.alpha_model, rhs.alpha_model)
self.assertEqual(lhs.data_meta, rhs.data_meta)
def test_data_meta_persistence(self):
freq = '5b'
universe = Universe('custom', ['zz800'])
batch = 4
neutralized_risk = ['SIZE']
risk_model = 'long'
pre_process = ['standardize', 'winsorize_normal']
post_process = ['standardize', 'winsorize_normal']
warm_start = 2
data_source = 'postgresql://user:pwd@server/dummy'
data_meta = DataMeta(freq=freq,
universe=universe,
batch=batch,
neutralized_risk=neutralized_risk,
risk_model=risk_model,
pre_process=pre_process,
post_process=post_process,
warm_start=warm_start,
data_source=data_source)
data_desc = data_meta.save()
loaded_data = DataMeta.load(data_desc)
self.assertEqual(data_meta.freq, loaded_data.freq)
self.assertEqual(data_meta.universe, loaded_data.universe)
self.assertEqual(data_meta.batch, loaded_data.batch)
self.assertEqual(data_meta.neutralized_risk, loaded_data.neutralized_risk)
self.assertEqual(data_meta.risk_model, loaded_data.risk_model)
self.assertEqual(data_meta.pre_process, loaded_data.pre_process)
self.assertEqual(data_meta.post_process, loaded_data.post_process)
self.assertEqual(data_meta.warm_start, loaded_data.warm_start)
self.assertEqual(data_meta.data_source, loaded_data.data_source)
def test_composer_persistence(self):
freq = '5b'
universe = Universe('custom', ['zz800'])
batch = 4
neutralized_risk = ['SIZE']
risk_model = 'long'
pre_process = ['standardize', 'winsorize_normal']
post_process = ['standardize', 'winsorize_normal']
warm_start = 2
data_source = 'postgresql://user:pwd@server/dummy'
data_meta = DataMeta(freq=freq,
universe=universe,
batch=batch,
neutralized_risk=neutralized_risk,
risk_model=risk_model,
pre_process=pre_process,
post_process=post_process,
warm_start=warm_start,
data_source=data_source)
features = {'f1': 'closePrice', 'f2': 'openPrice'}
alpha_model = XGBClassifier(features=features)
composer = Composer(alpha_model=alpha_model,
data_meta=data_meta)
comp_desc = composer.save()
loaded_comp = Composer.load(comp_desc)
self._assert_composer_equal(composer, loaded_comp)
......@@ -7,6 +7,7 @@ Created on 2017-9-4
import unittest
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression as LinearRegression2
from alphamind.model.loader import load_model
from alphamind.model.linearmodel import ConstLinearModel
......@@ -19,23 +20,25 @@ class TestLinearModel(unittest.TestCase):
def setUp(self):
self.n = 3
self.train_x = np.random.randn(1000, self.n)
self.features = ['a', 'b', 'c']
self.train_x = pd.DataFrame(np.random.randn(1000, self.n), columns=['a', 'b', 'c'])
self.train_y = np.random.randn(1000)
self.train_y_label = np.where(self.train_y > 0., 1, 0)
self.predict_x = np.random.randn(10, self.n)
self.predict_x = pd.DataFrame(np.random.randn(10, self.n), columns=['a', 'b', 'c'])
def test_const_linear_model(self):
weights = np.array([1., 2., 3.])
model = ConstLinearModel(features=['a', 'b', 'c'],
features = ['c', 'b', 'a']
weights = dict(c=3., b=2., a=1.)
model = ConstLinearModel(features=features,
weights=weights)
calculated_y = model.predict(self.predict_x)
expected_y = self.predict_x @ weights
expected_y = self.predict_x[features] @ np.array([weights[f] for f in features])
np.testing.assert_array_almost_equal(calculated_y, expected_y)
def test_const_linear_model_persistence(self):
weights = np.array([1., 2., 3.])
weights = dict(c=3., b=2., a=1.)
model = ConstLinearModel(features=['a', 'b', 'c'],
weights=weights)
......
......@@ -7,6 +7,7 @@ Created on 2017-9-5
import unittest
import numpy as np
import pandas as pd
from alphamind.model.linearmodel import LinearRegression
from alphamind.model.loader import load_model
......@@ -15,10 +16,10 @@ class TestLoader(unittest.TestCase):
def setUp(self):
self.n = 3
self.trained_x = np.random.randn(1000, self.n)
self.trained_x = pd.DataFrame(np.random.randn(1000, self.n), columns=['a', 'b', 'c'])
self.trained_y = np.random.randn(1000, 1)
self.predict_x = np.random.randn(100, self.n)
self.predict_x = pd.DataFrame(np.random.randn(100, self.n), columns=['a', 'b', 'c'])
def test_load_model(self):
model = LinearRegression(['a', 'b', 'c'])
......
# -*- coding: utf-8 -*-
"""
Created on 2018-2-8
@author: cheng.li
"""
import unittest
from alphamind.model.linearmodel import ConstLinearModel
class TestModelBase(unittest.TestCase):
def test_simple_model_features(self):
model = ConstLinearModel(features=['c', 'b', 'a'])
self.assertListEqual(['a', 'b', 'c'], model.features)
\ No newline at end of file
......@@ -7,6 +7,7 @@ Created on 2018-1-5
import unittest
import numpy as np
import pandas as pd
from alphamind.model.loader import load_model
from alphamind.model.treemodel import RandomForestRegressor
from alphamind.model.treemodel import RandomForestClassifier
......@@ -18,23 +19,24 @@ from alphamind.model.treemodel import XGBTrainer
class TestTreeModel(unittest.TestCase):
def setUp(self):
self.x = np.random.randn(1000, 10)
self.features = list('0123456789')
self.x = pd.DataFrame(np.random.randn(1000, 10), columns=self.features)
self.y = np.random.randn(1000)
self.sample_x = pd.DataFrame(np.random.randn(100, 10), columns=self.features)
def test_random_forest_regress_persistence(self):
model = RandomForestRegressor(features=list(range(10)))
model = RandomForestRegressor(features=self.features)
model.fit(self.x, self.y)
desc = model.save()
new_model = load_model(desc)
self.assertEqual(model.features, new_model.features)
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.predict(self.sample_x), new_model.predict(self.sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_random_forest_classify_persistence(self):
model = RandomForestClassifier(features=list(range(10)))
model = RandomForestClassifier(features=self.features)
y = np.where(self.y > 0, 1, 0)
model.fit(self.x, y)
......@@ -42,24 +44,22 @@ class TestTreeModel(unittest.TestCase):
new_model = load_model(desc)
self.assertEqual(model.features, new_model.features)
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.predict(self.sample_x), new_model.predict(self.sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_xgb_regress_persistence(self):
model = XGBRegressor(features=list(range(10)))
model = XGBRegressor(features=self.features)
model.fit(self.x, self.y)
desc = model.save()
new_model = load_model(desc)
self.assertEqual(model.features, new_model.features)
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.predict(self.sample_x), new_model.predict(self.sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_xgb_classify_persistence(self):
model = XGBClassifier(features=list(range(10)))
model = XGBClassifier(features=self.features)
y = np.where(self.y > 0, 1, 0)
model.fit(self.x, y)
......@@ -67,20 +67,18 @@ class TestTreeModel(unittest.TestCase):
new_model = load_model(desc)
self.assertEqual(model.features, new_model.features)
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.predict(self.sample_x), new_model.predict(self.sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
def test_xgb_trainer_equal_classifier(self):
sample_x = np.random.randn(100, 10)
model1 = XGBClassifier(n_estimators=100,
learning_rate=0.1,
max_depth=3,
features=list(range(10)),
features=self.features,
random_state=42)
model2 = XGBTrainer(features=list(range(10)),
model2 = XGBTrainer(features=self.features,
objective='reg:logistic',
booster='gbtree',
tree_method='exact',
......@@ -93,13 +91,13 @@ class TestTreeModel(unittest.TestCase):
model1.fit(self.x, y)
model2.fit(self.x, y)
predict1 = model1.predict(sample_x)
predict2 = model2.predict(sample_x)
predict1 = model1.predict(self.sample_x)
predict2 = model2.predict(self.sample_x)
predict2 = np.where(predict2 > 0.5, 1., 0.)
np.testing.assert_array_almost_equal(predict1, predict2)
def test_xgb_trainer_persistence(self):
model = XGBTrainer(features=list(range(10)),
model = XGBTrainer(features=self.features,
objective='binary:logistic',
booster='gbtree',
tree_method='hist',
......@@ -111,6 +109,5 @@ class TestTreeModel(unittest.TestCase):
new_model = load_model(desc)
self.assertEqual(model.features, new_model.features)
sample_x = np.random.randn(100, 10)
np.testing.assert_array_almost_equal(model.predict(sample_x), new_model.predict(sample_x))
np.testing.assert_array_almost_equal(model.predict(self.sample_x), new_model.predict(self.sample_x))
np.testing.assert_array_almost_equal(model.importances, new_model.importances)
# -*- coding: utf-8 -*-
"""
Created on 2018-2-7
@author: cheng.li
"""
import unittest
from alphamind.portfolio.allocations import Asset
from alphamind.portfolio.allocations import Portfolio
from alphamind.portfolio.allocations import Positions
from alphamind.portfolio.allocations import Execution
class TestAllocation(unittest.TestCase):
pass
\ No newline at end of file
......@@ -15,6 +15,7 @@ from alphamind.tests.data.test_neutralize import TestNeutralize
from alphamind.tests.data.test_standardize import TestStandardize
from alphamind.tests.data.test_winsorize import TestWinsorize
from alphamind.tests.data.test_quantile import TestQuantile
from alphamind.tests.data.engines.test_universe import TestUniverse
from alphamind.tests.portfolio.test_constraints import TestConstraints
from alphamind.tests.portfolio.test_evolver import TestEvolver
from alphamind.tests.portfolio.test_longshortbuild import TestLongShortBuild
......@@ -27,9 +28,11 @@ from alphamind.tests.analysis.test_riskanalysis import TestRiskAnalysis
from alphamind.tests.analysis.test_perfanalysis import TestPerformanceAnalysis
from alphamind.tests.analysis.test_factoranalysis import TestFactorAnalysis
from alphamind.tests.analysis.test_quantilieanalysis import TestQuantileAnalysis
from alphamind.tests.model.test_modelbase import TestModelBase
from alphamind.tests.model.test_linearmodel import TestLinearModel
from alphamind.tests.model.test_treemodel import TestTreeModel
from alphamind.tests.model.test_loader import TestLoader
from alphamind.tests.model.test_composer import TestComposer
from alphamind.tests.execution.test_naiveexecutor import TestNaiveExecutor
from alphamind.tests.execution.test_thresholdexecutor import TestThresholdExecutor
from alphamind.tests.execution.test_targetvolexecutor import TestTargetVolExecutor
......@@ -42,6 +45,7 @@ if __name__ == '__main__':
TestStandardize,
TestWinsorize,
TestQuantile,
TestUniverse,
TestConstraints,
TestEvolver,
TestLongShortBuild,
......@@ -54,9 +58,11 @@ if __name__ == '__main__':
TestPerformanceAnalysis,
TestFactorAnalysis,
TestQuantileAnalysis,
TestModelBase,
TestLinearModel,
TestTreeModel,
TestLoader,
TestComposer,
TestNaiveExecutor,
TestThresholdExecutor,
TestTargetVolExecutor,
......
......@@ -6,7 +6,7 @@ pandas >= 0.19.2
scikit-learn >= 0.18.1
numba >= 0.33.0
scipy >= 0.19.0
simpleutils >= 0.1.0
simpleutils >= 0.1.2
sqlalchemy >= 1.1.14
psycopg2 >= 2.7.1
finance-python >= 0.5.7
\ No newline at end of file