Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Sign in
Toggle navigation
A
alpha-mind
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Dr.李
alpha-mind
Commits
2b165e01
Commit
2b165e01
authored
Feb 10, 2018
by
Dr.李
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added factor strategy with dask cluster
parent
2dc2692a
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
447 additions
and
0 deletions
+447
-0
full factors strategy with dask cluster.ipynb
notebooks/full factors strategy with dask cluster.ipynb
+447
-0
No files found.
notebooks/full factors strategy with dask cluster.ipynb
0 → 100644
View file @
2b165e01
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import numpy as np\n",
"import pandas as pd\n",
"from matplotlib import pyplot as plt\n",
"import functools\n",
"from alphamind.api import *\n",
"from PyFin.api import *\n",
"\n",
"plt.style.use('ggplot')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Meta Data Parameters\n",
"----------------------"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"freq = '10b'\n",
"universe = Universe('custom', ['zz800'])\n",
"batch = 16\n",
"neutralized_risk = ['SIZE'] + industry_styles\n",
"risk_model = 'short'\n",
"pre_process = [winsorize_normal, standardize]\n",
"post_process = [winsorize_normal, standardize]\n",
"warm_start = 0\n",
"data_source = 'postgres+psycopg2://postgres:we083826@192.168.0.102/alpha'\n",
"dask_cluster = '192.168.0.102:8786'\n",
"\n",
"horizon = map_freq(freq)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Helper function to train / Predict a model\n",
"-------------------"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def train_daily_model(params):\n",
" ref_date, meta_model = params\n",
" data_meta = DataMeta(freq=freq,\n",
" universe=universe,\n",
" batch=batch,\n",
" neutralized_risk=neutralized_risk,\n",
" risk_model=risk_model,\n",
" pre_process=pre_process,\n",
" post_process=post_process,\n",
" warm_start=warm_start,\n",
" data_source=data_source)\n",
"\n",
" return train_model(ref_date=ref_date, alpha_model=meta_model, data_meta=data_meta)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def predict_daily_model(params):\n",
" ref_date, alpha_model = params\n",
" data_meta = DataMeta(freq=freq,\n",
" universe=universe,\n",
" batch=batch,\n",
" neutralized_risk=neutralized_risk,\n",
" risk_model=risk_model,\n",
" pre_process=pre_process,\n",
" post_process=post_process,\n",
" warm_start=warm_start,\n",
" data_source=data_source)\n",
" return predict_by_model(ref_date=ref_date, alpha_model=alpha_model, data_meta=data_meta)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Strategy Settings\n",
"---------------------"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"start_date = '2010-02-01'\n",
"end_date = '2018-01-29'\n",
"industry_category = 'sw_adj'\n",
"industry_level = 1\n",
"industries = industry_list(industry_category, industry_level)\n",
"styles = ['SIZE']\n",
"benchmark = 300\n",
"turn_over_target_base = 0.30\n",
"executor = NaiveExecutor()\n",
"\n",
"ref_dates = makeSchedule(firstDate=start_date,\n",
" endDate=end_date,\n",
" tenor=freq,\n",
" calendar='china.sse',\n",
" dateGenerationRule=DateGeneration.Backward)\n",
"ref_dates = [ref_date.strftime('%Y-%m-%d') for ref_date in ref_dates]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Features\n",
"\n",
"base1 = LAST('roe_q')\n",
"base2 = CSRes('ep_q', base1)\n",
"\n",
"features = {'f01': base1,\n",
" 'f02': base2,\n",
" 'f03': CSRes(CSRes('market_confidence_15d', base1), base2),\n",
" 'f04': CSRes(CSRes('RecievableTO', base1), base2),\n",
" 'f05': CSRes(CSRes('val_q', base1), base2),\n",
" 'f06': CSRes(CSRes('BP', base1), base2),\n",
" 'f07': CSRes(CSRes('con_pe_rolling_order', base1), base2),\n",
" 'f08': CSRes(CSRes('con_pb_rolling_order', base1), base2),\n",
" 'f09': CSRes(CSRes('DebtEquityRatio', base1), base2)}\n",
"\n",
"weights = {'f01': 1.0,\n",
" 'f02': 1.0,\n",
" 'f03': 0.25,\n",
" 'f04': 0.25,\n",
" 'f05': 0.25,\n",
" 'f06': 0.25,\n",
" 'f07': -0.25,\n",
" 'f08': -0.25,\n",
" 'f09': -0.25}\n",
"\n",
"const_model = ConstLinearModel(features=features, weights=weights)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Model\n",
"\n",
"meta_model = XGBTrainer(features=features,\n",
" objective='reg:linear',\n",
" booster='gbtree',\n",
" tree_method='hist',\n",
" n_estimators=2000,\n",
" learning_rate=0.01,\n",
" early_stopping_rounds=30,\n",
" subsample=0.25,\n",
" colsample_bytree=1.,\n",
" n_jobs=1,\n",
" eval_sample=0.3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train / Predict Models with Dask Cluster\n",
"--------------------------"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Training Phase"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from dask.distributed import Client\n",
"client = Client(dask_cluster)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"\n",
"tasks = client.map(train_daily_model, [(ref_date, meta_model) for ref_date in ref_dates])\n",
"models = client.gather(tasks) "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Predicting Phase"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"\n",
"tasks = client.map(predict_daily_model, list(zip(ref_dates, models)))\n",
"predictions1 = client.gather(tasks)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"\n",
"tasks = client.map(predict_daily_model, [(ref_date, const_model) for ref_date in ref_dates])\n",
"predictions2 = client.gather(tasks)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Portfolio Rebalance\n",
"-----------------------"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# constraints setting\n",
"total_risks = industries + styles + ['benchmark']\n",
"\n",
"b_type = []\n",
"l_val = []\n",
"u_val = []\n",
"\n",
"for j, name in enumerate(total_risks):\n",
" if name == 'benchmark':\n",
" b_type.append(BoundaryType.RELATIVE)\n",
" l_val.append(0.8)\n",
" u_val.append(1.0)\n",
" elif name == 'SIZE':\n",
" b_type.append(BoundaryType.ABSOLUTE)\n",
" l_val.append(0.)\n",
" u_val.append(0.)\n",
" else:\n",
" b_type.append(BoundaryType.RELATIVE)\n",
" l_val.append(1.0)\n",
" u_val.append(1.0)\n",
" \n",
"\n",
"bounds = create_box_bounds(total_risks, b_type, l_val, u_val)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"\n",
"engine = SqlEngine(data_source)\n",
"\n",
"rets = []\n",
"turn_overs = []\n",
"previous_pos = pd.DataFrame()\n",
"\n",
"trade_dates = ref_dates\n",
"\n",
"for i, ref_date in enumerate(trade_dates):\n",
" er = 0.0 * predictions1[i].values.flatten().astype(float) + 1.0 * predictions2[i].values.flatten().astype(float)\n",
" codes = predictions2[i].index.values.astype(int).tolist()\n",
" industry_data = engine.fetch_industry_matrix(ref_date, codes, category=industry_category, level=industry_level)\n",
" industry_exp = industry_data[industries].values\n",
" industry_names = industry_data.industry_name.values\n",
" style_exp = engine.fetch_risk_model(ref_date, codes, risk_model=risk_model)[1][styles].values\n",
" benchmark_w = engine.fetch_benchmark(ref_date, benchmark, codes).weight.values\n",
" is_in_benchmark = (benchmark_w > 0.).astype(float)\n",
" \n",
" risk_exp = np.concatenate([industry_exp, style_exp, is_in_benchmark.reshape((-1, 1))], axis=1)\n",
" cons_mat = pd.DataFrame(risk_exp, columns=total_risks)\n",
" constraint = LinearConstraints(bounds=bounds,\n",
" cons_mat=cons_mat,\n",
" backbone=benchmark_w)\n",
" \n",
" lbound = np.maximum(0., benchmark_w - 0.02)\n",
" ubound = 0.02 + benchmark_w\n",
" \n",
" if previous_pos.empty:\n",
" current_position = None\n",
" turn_over_target = None\n",
" else:\n",
" previous_pos.set_index('code', inplace=True)\n",
" remained_pos = previous_pos.loc[codes]\n",
"\n",
" remained_pos.fillna(0., inplace=True)\n",
" turn_over_target = turn_over_target_base\n",
" current_position = remained_pos.weight.values\n",
" \n",
" try:\n",
" target_pos, _ = er_portfolio_analysis(er,\n",
" industry_names,\n",
" None,\n",
" constraint,\n",
" False,\n",
" benchmark_w,\n",
" method='risk_neutral',\n",
" turn_over_target=turn_over_target,\n",
" current_position=current_position,\n",
" lbound=lbound,\n",
" ubound=ubound)\n",
" except ValueError:\n",
" alpha_logger.info('{0} full re-balance'.format(ref_date))\n",
" target_pos, _ = er_portfolio_analysis(er,\n",
" industry_names,\n",
" None,\n",
" constraint,\n",
" False,\n",
" benchmark_w,\n",
" method='risk_neutral',\n",
" lbound=lbound,\n",
" ubound=ubound)\n",
" \n",
" target_pos['code'] = codes\n",
" \n",
" turn_over, executed_pos = executor.execute(target_pos=target_pos)\n",
" executed_codes = executed_pos.code.tolist()\n",
" \n",
" dx_returns = engine.fetch_dx_return(ref_date, executed_codes, horizon=horizon, offset=1)\n",
" result = pd.merge(executed_pos, dx_returns, on=['code'])\n",
" ret = result.weight.values @ (np.exp(result.dx.values) - 1.)\n",
" rets.append(np.log(1. + ret))\n",
" \n",
" executor.set_current(executed_pos)\n",
" turn_overs.append(turn_over)\n",
" previous_pos = executed_pos\n",
"\n",
" alpha_logger.info('{0} is finished'.format(ref_date))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ret_df = pd.DataFrame({'returns': rets, 'turn_over': turn_overs}, index=trade_dates)\n",
"\n",
"# index return\n",
"index_return = engine.fetch_dx_return_index_range(benchmark,\n",
" dates=trade_dates,\n",
" horizon=horizon,\n",
" offset=1).set_index('trade_date')\n",
"ret_df['index'] = index_return['dx']\n",
"\n",
"ret_df.loc[advanceDateByCalendar('china.sse', trade_dates[-1], freq)] = 0.\n",
"ret_df = ret_df.shift(1)\n",
"ret_df.iloc[0] = 0.\n",
"ret_df['tc_cost'] = ret_df.turn_over * 0.002\n",
"ret_df['returns'] = ret_df['returns'] - ret_df['index']\n",
"\n",
"ret_df[['returns', 'tc_cost']].cumsum().plot(figsize=(12, 6),\n",
" title='Fixed frequency rebalanced: {0}'.format(freq),\n",
" secondary_y='tc_cost')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment