added sample notebooks

2f997476 · Dr.李 · e86d6f8f · 2f997476
Commit 2f997476 authored May 01, 2017 by Dr.李
Hide whitespace changes
Inline Side-by-side

Showing with 571 additions and 0 deletions

factor analysis.ipynb notebooks/factor analysis.ipynb +571 -0

No files found.
--- a/notebooks/factor analysis.ipynb
+++ b/notebooks/factor analysis.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %pylab turns on inline plotting and also star-imports numpy/pylab names into the notebook namespace.\n",
+    "%pylab inline\n",
+    "import matplotlib.pyplot as plt\n",
+    "# Apply the ggplot look to the figures, then list the styles this matplotlib install offers.\n",
+    "plt.style.use('ggplot')\n",
+    "print(plt.style.available)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import sqlalchemy\n",
+    "\n",
+    "import alphamind.data.neutralize as ne\n",
+    "import alphamind.data.winsorize as ws\n",
+    "import alphamind.data.standardize as st\n",
+    "import alphamind.portfolio.rankbuilder as rb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# Risk-exposure table, benchmark column in index_data, and the factor column under study.\n",
+    "risk_factor_table = 'risk_factor_500'\n",
+    "benchmark = 'zz500'\n",
+    "factor = 'ROEAfterNonRecurring' # 'DROEAfterNonRecurring'\n",
+    "\n",
+    "# Credentials must not live in the notebook: read the full SQLAlchemy URL from the environment, e.g.\n",
+    "#   ALPHAMIND_DB_URL='mysql+mysqldb://USER:PASSWORD@localhost:3306/multifactor?charset=utf8'\n",
+    "db_url = os.environ.get('ALPHAMIND_DB_URL')\n",
+    "if not db_url:\n",
+    "    raise RuntimeError('ALPHAMIND_DB_URL is not set; export the database URL before running this notebook')\n",
+    "conn = sqlalchemy.create_engine(db_url)\n",
+    "\n",
+    "# Join factor values, daily returns and risk exposures on (Date, Code).\n",
+    "# The table/column names are formatted in from the constants above, not from external input.\n",
+    "query = ('select factor_data.{0},  trade_data.Return as dailyReturn, {1}.* '\n",
+    "         'from factor_data, trade_data, {1} '\n",
+    "         'where factor_data.Date = {1}.Date and factor_data.Code = {1}.Code '\n",
+    "         'and factor_data.Date = trade_data.Date and factor_data.Code = trade_data.Code;'\n",
+    "         ).format(factor, risk_factor_table)\n",
+    "df = pd.read_sql(query, conn)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Remove the two financial-sector columns from the loaded frame.\n",
+    "df = df.drop(['Bank', 'NonBankFinancial'], axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Everything from the fifth column onwards is treated as a risk-factor exposure.\n",
+    "# NOTE(review): positional slice -- presumably columns 0-3 are the factor, dailyReturn, Date and Code; confirm against the query result.\n",
+    "risk_facto_cols = df.columns[4:]\n",
+    "risk_facto_cols"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Next-row return of the same stock. The SQL query has no ORDER BY, so rows are ordered by\n",
+    "# Code/Date before shifting; the result is assigned back by index, so df keeps its row order.\n",
+    "df['d1ret'] = df.sort_values(['Code', 'Date']).groupby('Code')['dailyReturn'].shift(-1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Benchmark index level per date, read from index_data.\n",
+    "benchmark_query = 'select {0}, Date from index_data'.format(benchmark)\n",
+    "benchmark_data = pd.read_sql(benchmark_query, conn)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# index_data is read without an ORDER BY, so order by Date before relying on shift().\n",
+    "benchmark_data = benchmark_data.sort_values('Date')\n",
+    "# Return of the benchmark from one row to the next.\n",
+    "benchmark_data['ret'] = benchmark_data[benchmark] / benchmark_data[benchmark].shift(1) - 1.\n",
+    "# The following row's benchmark return, stored next to the current date.\n",
+    "benchmark_data['d1ret_b'] = benchmark_data['ret'].shift(-1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Attach the forward benchmark return to every stock row of the same date (inner join on Date).\n",
+    "benchmark_forward = benchmark_data[['Date', 'd1ret_b']]\n",
+    "df = df.merge(benchmark_forward, on='Date', how='inner')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Discard every row that has a missing value in any column, then report the remaining size.\n",
+    "df = df.dropna()\n",
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Factor Data Preprocessing (Winsorize -> Standardize -> Neutralize)\n",
+    "-----------------------------------------------------------------------"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Work on a copy so the columns added below do not touch df.\n",
+    "total_data = df.copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Factor values as an (n, 1) column. reshape() hands back a new array object instead of\n",
+    "# rewriting, in place, the shape of the array returned by `.values`.\n",
+    "y = total_data[factor].values.reshape(-1, 1)\n",
+    "# Integer Date key per row; used as the grouping label by the cells below.\n",
+    "groups = total_data.Date.values.astype(int)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# Pipeline from the section title, one named step at a time: winsorize -> standardize -> neutralize,\n",
+    "# each called with the per-row Date groups.\n",
+    "risk_exposures = total_data[risk_facto_cols].values\n",
+    "winsorized = ws.winsorize_normal(y, groups=groups)\n",
+    "standardized = st.standardize(winsorized, groups=groups)\n",
+    "total_data['res'] = ne.neutralize(risk_exposures, standardized, groups)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Show the last rows of the risk-factor columns.\n",
+    "total_data[risk_facto_cols].tail()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Compare the raw factor with its neutralized value ('res') on the last rows.\n",
+    "total_data[[factor, 'res', 'Date', 'Code']].tail()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Factor Performance (Long-Short)\n",
+    "------------------------------------------------------------------------------------"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# Long-short weights: scale each date's residuals so that the absolute weights of a date sum to 1.\n",
+    "# transform('sum') broadcasts the per-date total back onto the original rows, so the result stays\n",
+    "# index-aligned with total_data no matter how groupby.apply would label its output.\n",
+    "gross_exposure = total_data.res.abs().groupby(groups).transform('sum')\n",
+    "total_data['pos'] = total_data.res / gross_exposure"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Show the last rows of the weights next to the residual and the return columns.\n",
+    "total_data[['pos', 'res', 'dailyReturn', 'd1ret', 'd1ret_b', 'Code', 'Date']].tail()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Weight times forward excess return over the benchmark, summed per date.\n",
+    "forward_excess = total_data.d1ret - total_data.d1ret_b\n",
+    "weighted_excess = total_data.pos * forward_excess\n",
+    "ret_series = weighted_excess.groupby(total_data.Date).sum()\n",
+    "# Parse the YYYYMMDD date keys into timestamps for plotting.\n",
+    "ret_series.index = pd.to_datetime(ret_series.index, format='%Y%m%d')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cumulative sum of the per-date returns over the whole sample.\n",
+    "ret_series.cumsum().plot(figsize=(14,7))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Same cumulative curve, restricted to its last 20 points.\n",
+    "ret_series.cumsum()[-20:].plot(figsize=(14,7))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sum of absolute weights per date, first few dates.\n",
+    "total_data.pos.abs().groupby(groups).sum().head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "# Factor Performance (Long Only - Top 100 equal weighted)\n",
+    "------------------------------------------------------------------------------------"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# Number of names passed to rank_build; its output is divided by the same number to get the weights.\n",
+    "use_rank = 100\n",
+    "ranked = rb.rank_build(total_data.res.values, use_rank, groups)\n",
+    "total_data['pos'] = ranked / use_rank"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Show the last rows of the new weights next to the residual and the return columns.\n",
+    "total_data[['pos', 'res', 'dailyReturn', 'd1ret', 'd1ret_b', 'Code', 'Date']].tail()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Weight times forward excess return over the benchmark, summed per date key in `groups`.\n",
+    "forward_excess = total_data.d1ret - total_data.d1ret_b\n",
+    "weighted_excess = total_data.pos * forward_excess\n",
+    "ret_series = weighted_excess.groupby(groups).sum()\n",
+    "# Parse the YYYYMMDD date keys into timestamps for plotting.\n",
+    "ret_series.index = pd.to_datetime(ret_series.index, format='%Y%m%d')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cumulative sum of the per-date returns over the whole sample.\n",
+    "ret_series.cumsum().plot(figsize=(14,7))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Same cumulative curve, restricted to its last 20 points.\n",
+    "ret_series.cumsum()[-20:].plot(figsize=(14,7))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sum of the weights per date, first few dates.\n",
+    "total_data.pos.groupby(groups).sum().head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "# Risk Exposure\n",
+    "-------------------------------------"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Forward excess return and current weights as explicit (n, 1) columns.\n",
+    "# reshape() is used instead of assigning to `.shape`, so nothing depends on an in-place change\n",
+    "# to the array that pandas hands out through `.values`.\n",
+    "excess_return = (total_data.d1ret - total_data.d1ret_b).values.reshape(-1, 1)\n",
+    "pos_series = total_data.pos.values.reshape(-1, 1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Multiply the (n, 1) weight column row by row: a 1-D (n,) array times an (n, 1) column\n",
+    "# would broadcast to an (n, n) matrix instead.\n",
+    "to_explain = pos_series * excess_return\n",
+    "# Risk-factor exposures, each column scaled by the row's excess return.\n",
+    "depends_pos = total_data[risk_facto_cols].values\n",
+    "depends = depends_pos * excess_return"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# Calls alphamind's neutralize with `depends` in the first slot and `to_explain` in the second, grouped by date.\n",
+    "# Two values come back: the first is stored as `idiosyncratic`, the second is a mapping whose\n",
+    "# 'explained' and 'exposure' entries are read in the next cell.\n",
+    "# NOTE(review): presumably a per-group regression returning the residual -- verify against alphamind.data.neutralize.\n",
+    "idiosyncratic, other_stats = ne.neutralize(depends, to_explain, groups, output_exposure=True, output_explained=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Pull the two extra outputs requested from neutralize out of the returned mapping.\n",
+    "systemetic = other_stats['explained']\n",
+    "exposure = other_stats['exposure']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Place the residual next to the first slice (last axis) of the explained array, column-wise.\n",
+    "# Use the explicit numpy namespace: a bare `hstack` only exists through the `%pylab` star-import.\n",
+    "analyis_table = np.hstack((idiosyncratic, systemetic[:, :, 0]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Column labels: the residual first, then one label per risk factor; rows are labelled by date key.\n",
+    "cols = ['idiosyncratic'] + list(risk_facto_cols)\n",
+    "analyis_table = pd.DataFrame(analyis_table, index=groups, columns=cols)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Sum the per-row contributions of every date, then turn the YYYYMMDD keys into timestamps.\n",
+    "daily_totals = analyis_table.groupby(level=0).sum()\n",
+    "daily_totals.index = pd.to_datetime(daily_totals.index, format='%Y%m%d')\n",
+    "aggregated_bars = daily_totals"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Absolute total contribution of every source, largest first; computed once and reused below.\n",
+    "ranked_sources = aggregated_bars.sum().abs().sort_values(ascending=False)\n",
+    "# Labels of the ten largest sources.\n",
+    "top_sources = ranked_sources.index[:10]\n",
+    "ranked_sources.plot(kind='bar', figsize=(16, 8))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cumulative per-date contribution of the ten largest sources.\n",
+    "aggregated_bars[top_sources].cumsum().plot(figsize=(14, 7))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Label each exposure row with its date key, keep the first row of every date, then parse the dates.\n",
+    "exposure_by_row = pd.DataFrame(exposure[:, :, 0], index=groups, columns=risk_facto_cols)\n",
+    "exposure_table = exposure_by_row.groupby(level=0).first()\n",
+    "exposure_table.index = pd.to_datetime(exposure_table.index, format='%Y%m%d')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 'idiosyncratic' is a column of aggregated_bars but not of exposure_table, so filter it out by name\n",
+    "# rather than assuming it is always the first (largest) entry of top_sources.\n",
+    "risk_sources = [name for name in top_sources if name != 'idiosyncratic']\n",
+    "exposure_table[risk_sources].plot(figsize=(14,7))\n",
+    "# Keep at least one legend column so a short source list cannot produce ncol=0.\n",
+    "plt.legend(loc='upper center', ncol=max(1, len(risk_sources) // 3))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Clean up\n",
+    "-----------------------"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Drop the two large frames from the namespace.\n",
+    "del df, total_data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# The studied factor followed by every risk-factor column name, as one plain list.\n",
+    "total_factors = [factor, *risk_facto_cols.tolist()]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Display the combined factor list.\n",
+    "total_factors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# NOTE(review): trailing question mark is IPython help lookup on `.plot` -- looks like leftover scratch; consider removing this cell.\n",
+    "aggregated_bars.sum().abs().sort_values(ascending=False)[:10].plot?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# NOTE(review): `.plot` is referenced but never called, so no figure is drawn -- looks like leftover scratch.\n",
+    "aggregated_bars.sum().abs().sort_values(ascending=False)[:10].plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda root]",
+   "language": "python",
+   "name": "conda-root-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}