# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.14.1
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# <h2>
# Exploratory Statistics of Pairs Trading
# </h2>
# <p>
# The Jupyter notebook explores pairs trading strategies.
# This document is available on the GitHub repository https://github.com/IanLKaplan/pairs_trading
# </p>
# <blockquote>
# <p>
# Pairs trading is an approach that takes advantage of the
# mispricing between two (or more) co-moving assets, by
# taking a long position in one (many) and shorting the
# other(s), betting that the relationship will hold and that
# prices will converge back to an equilibrium level.
# </p>
# <p>
# <i>Definitive Guide to Pairs Trading</i> available from <a href="https://hudsonthames.org/">Hudson and Thames</a>
# </p>
# </blockquote>
# <p>
# Pairs trading is sometimes referred to as a statistical arbitrage trading strategy.
# </p>
# <blockquote>
# <p>
# Statistical arbitrage and pairs trading tries to solve this problem using price relativity. If two assets share the same
# characteristics and risk exposures, then we can assume that their behavior would be similar as well. This has
# the benefit of not having to estimate the intrinsic value of an asset but rather just if it is under or overvalued
# relative to a peer(s). We only have to focus on the relationship between the two, and if the spread happens
# to widen, it could be that one of the securities is overpriced, the other is underpriced, or the mispricing is a
# combination of both.
# </p>
# <p>
# <i>Definitive Guide to Pairs Trading</i> available from <a href="https://hudsonthames.org/">Hudson and Thames</a>
# </p>
# </blockquote>
# <p>
# Pairs trading algorithms have been reported to yield portfolios with Sharpe ratios in excess of 1.0 and returns of 10% or
# higher. Pairs trading takes both long and short positions, so the portfolio tends to be market neutral. A pairs trading portfolio
# can have drawdowns, but the drawdowns should be less than a benchmark like the S&P 500 because of the market neutral nature of the
# portfolio. The lower-risk, market neutral structure of a pairs trading portfolio also means that the portfolio will tend to have a lower return
# than a comparable benchmark like the S&P 500.
# </p>
# <p>
# Markets tend toward efficiency and many quantitative approaches fade over time as they are adopted by hedge funds. Pairs trading
# goes back to the mid-1980s. Surprisingly, pairs trading still seems to be profitable. One reason for this could be that there are a vast
# number of possible pairs and any pairs portfolio is only a fraction of the pairs universe. This could
# leave unexploited pairs in the market. Pairs trading may also be difficult to scale to a level that would be attractive to institutional
# traders, like hedge funds, so the strategy has not been arbitraged out of the market.
# </p>
# <p>
# Mathematical finance often uses models that are based on normal distributions, constant means and standard deviations. Actual market
# data is often not normally distributed and changes constantly. The statistics used to select stocks for pairs trading assume
# that the pair distribution has a constant mean and standard deviation (i.e., the pairs spread is a stationary time series). As this notebook
# shows, this assumption is statistically valid about forty percent of the time.
# </p>
# <p>
# The statistics that predict a successful pair will not be accurate in all time periods. For the strategy to be successful, the prediction
# must be right more often than not. To minimize the risk in any particular trade, this suggests that trading a larger portfolio will
# be more successful than trading a small portfolio.
# </p>
# <h3>
# Overview
# </h3>
# <p>
# The primary references used for this notebook are the books <i>Pairs Trading</i> by Ganapathy Vidyamurthy and <i>Algorithmic
# Trading</i> by Ernest P. Chan.
# </p>
# <p>
# The pairs trading strategy attempts to find a pair of stocks where the weighted spread between the stock close prices is
# mean reverting.
# </p>
# <p>
# Implementing the pairs trading strategy involves two logical steps:
# </p>
# <ol>
# <li>
# <p>
# Pairs selection: Identify a pair of stocks that are likely to have mean reverting behavior using a lookback period.
# </p>
# </li>
# <li>
# <p>
# Trading the stocks using the long/short strategy over the trading period. This involves building a trading signal
# from the weighted spread formed from the close prices of the stock pair. When the trading signal is above or below
# the mean by some threshold value, a long and short position are taken in the two stocks.
# </p>
# </li>
# </ol>
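# <p>
# A hedged, high-level sketch of the two steps listed above is shown below. This is not the backtest code used later in
# this notebook; the function names select_pairs and trade_pairs are placeholders for the pairs selection and trading steps.
# </p>
# +
def run_pairs_backtest(close_df, lookback, trade_len, select_pairs, trade_pairs):
    """
    Sketch of a rolling pairs trading backtest: select pairs over a lookback window, then
    trade them over the following trading window. select_pairs and trade_pairs are
    hypothetical callables standing in for the two steps described above.
    """
    results = []
    for start in range(0, close_df.shape[0] - (lookback + trade_len), trade_len):
        lookback_df = close_df.iloc[start:start + lookback]
        trading_df = close_df.iloc[start + lookback:start + lookback + trade_len]
        pairs = select_pairs(lookback_df)
        results.append(trade_pairs(pairs, trading_df))
    return results
# -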
# <h2>
# Pairs Selection
# </h2>
# <h3>
# S&P 500 Industry Sectors
# </h3>
# <p>
# In this notebook, pairs are selected from the S&P 500 stock universe. These stocks have a high trading volume, with a small
# bid-ask spread. S&P 500 stocks are also easier to short, with lower borrowing fees and a lower chance of a short position being called
# back.
# </p>
# <p>
# In pairs selection we are trying to find pairs that are cointegrated, where the price spread has mean reverting behavior. Just as there
# can be spurious correlation there can be spurious cointegration, so the stock pair should
# have some logical connection. In the book <i>Pairs Trading</i> (Vidyamurthy) the author discusses using factor models to select pairs with
# similar factor characteristics.
# </p>
# <p>
# Factor models are often built using company fundamental factors like earnings, corporate debt and cash flow. These factors
# tend to be generic. Many companies in completely different industry sectors may have similar fundamental factors. When selecting pairs
# we would like to select stocks that are affected by similar market forces. For example, energy sector stocks tend to be
# affected by the same economic and market forces. Factors affecting companies outside the energy sector can be much more complicated.
# In many cases the company factors that affect S&P 500 companies are broad economic factors that are not obviously useful in choosing pairs
# for mean reversion trading.
# </p>
# <p>
# In lieu of specific factors, the S&P 500 industry sector is used as the set from which pairs are drawn.
# Although not perfect, the industry sector will tend to select stocks with similar behavior, while reducing the universe of
# stocks from which pairs are selected.
# </p>
# <p>
# Reducing the universe of stock pairs is important because, even with modern computing power, it would be difficult
# to test all possible stock pairs in the S&P 500, since the number of pairs grows with the square of N, the number of stocks.
# </p>
#
# \$ number \space of \space pairs = \frac{N^2 - N}{2} $
#
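# <p>
# As a quick check of the formula (a minimal sketch, not part of the pairs selection code below), the pair count for a
# universe of roughly 500 stocks is:
# </p>
# +
# number of unique (unordered) pairs that can be formed from N assets: (N^2 - N) / 2
n_stocks = 500
num_pairs = (n_stocks ** 2 - n_stocks) // 2
print(f'unique pairs from {n_stocks} stocks: {num_pairs}')  # 124750
# -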
# <p>
# The S&P 500 component stocks (in 2022) and their related industries have been downloaded from barchart.com. The files are included in this
# GitHub repository.
# </p>
# <p>
# The S&P 500 industry sectors are:
# </p>
# <ol>
# <li>
# Consumer discretionary
# </li>
# <li>
# Consumer staples
# </li>
# <li>
# Energy
# </li>
# <li>
# Financials
# </li>
# <li>
# Health care
# </li>
# <li>
# Industrials
# </li>
# <li>
# Infotech
# </li>
# <li>
# Materials
# </li>
# <li>
# Real estate
# </li>
# <li>
# Communication services
# </li>
# <li>
# Utilities
# </li>
# </ol>
# <p>
# The stocks from the Barchart list include stocks for the same company, with different classes (e.g., class A, class B or class C stocks).
# The list of stocks is filtered to remove all but the lowest stock class (e.g., if there is a class B stock, the class A stock is removed
# from the list).
# </p>
# <h3>
# Stock Market Close Data
# </h3>
# <p>
# The pairs trading model in this notebook uses close price data for all of the S&P 500 stocks from the start date to yesterday
# (i.e., one day in the past). In other models (see Stock Market Cash Trigger and ETF Rotation) the close price data was downloaded the first time the
# notebook was run and stored in temporary files. The first notebook run incurred the initial overhead of downloading
# the data, but subsequent runs could read the data from local temporary files.
# </p>
# <p>
# For the S&P 500 stock universe, downloading all of the close price data the first time the notebook is run would have an unacceptable
# overhead. To avoid this, the data is downloaded once and stored in local files. When the notebook is run at later times, only the
# data between the last date in the file and the current end date is downloaded.
# </p>
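# <p>
# The sketch below illustrates the incremental download idea. It is a hypothetical sketch, not the MarketData
# implementation used in this notebook: the download step is passed in as a function, and the cache file name is made up
# for the example. On later runs only the dates after the last cached date are fetched.
# </p>
# +
import pandas as pd

def incremental_close_prices(download_close, symbols, start_date, cache_file='sp500_close_cache.csv'):
    """
    Hypothetical sketch of an incremental close-price cache.
    download_close(symbols, start_date) is assumed to return a DataFrame of close prices indexed by date.
    """
    try:
        # reuse previously downloaded data and only fetch dates after the last cached date
        cached_df = pd.read_csv(cache_file, index_col=0, parse_dates=True)
        fetch_start = cached_df.index[-1] + pd.Timedelta(days=1)
    except FileNotFoundError:
        # first run: nothing cached yet, download from the start date
        cached_df = pd.DataFrame()
        fetch_start = pd.Timestamp(start_date)
    new_df = download_close(symbols, fetch_start)
    prices_df = pd.concat([cached_df, new_df], axis=0)
    prices_df.to_csv(cache_file)
    return prices_df
# -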
# <p>
# There are stocks in the S&P 500 list that were listed on the stock exchange later than the start date. These
# stocks are filtered out, so the final stock set does not include all of the S&P 500 stocks.
# </p>
# <p>
# Filtering stocks in this way can create a survivorship bias. This should not be a problem for back testing pairs trading
# algorithms through the historical time period. The purpose of this backtest is to understand the pairs trading behavior.
# The results do not depend on the stock universe, only on the pairs selected.
# </p>
#
# +
#
# To generate a python file from the notebook use jupytext:
# pip install jupytext --upgrade
# jupytext --to py pairs_trading.ipynb
#
import os
from datetime import datetime
# from multiprocessing import Pool
# pathos is used as a more robust multiprocessing library
# pip install pathos
from pathos.multiprocessing import Pool
from typing import List, Tuple, Dict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from numpy import log
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from tabulate import tabulate
from coint_analysis.coint_analysis_result import CointAnalysisResult, CointInfo
from coint_data_io.coint_matrix_io import CointMatrixIO
from pairs.pairs import get_pairs
#
# Local libraries
#
from plot_ts.plot_time_series import plot_ts, plot_two_ts
from read_market_data.MarketData import MarketData, read_s_and_p_stock_info, extract_sectors
# Apply the default theme
sns.set_theme()
from s_and_p_filter import s_and_p_directory, s_and_p_stock_file
s_and_p_file = s_and_p_directory + os.path.sep + s_and_p_stock_file
start_date_str = '2007-01-03'
start_date: datetime = datetime.fromisoformat(start_date_str)
trading_days = 252
def calc_pair_counts(sector_info: dict) -> pd.DataFrame:
"""
Return a DataFrame
:param sector_info: a dictionary where the key is the sector name and the
value is a list of stock symbols for that sector
:return: a DataFrame where the index is the sector names and the columns are "num stocks" and "num pairs"
"num stocks" is the number of stocks in the sector. "num pairs" is the number of unique pairs
that can be formed from the set of sector stocks. The last row is the sum of the columns.
"""
column_label = ['num stocks', 'num pairs']
sectors = list(sector_info.keys())
counts_l: list = list()
n_l: list = list()
for sector in sectors:
n = len(sector_info[sector])
n_l.append(n)
count = ((n ** 2.0) - n) / 2.0
counts_l.append(count)
num_stocks = sum(n_l)
info_df = pd.DataFrame(n_l)
info_df = pd.concat([info_df, pd.DataFrame(counts_l)], axis=1)
info_df.columns = column_label
sum_pairs = sum(counts_l)
num_stocks_df = pd.DataFrame([num_stocks])
sum_df = pd.DataFrame([sum_pairs])
row_df = pd.concat([num_stocks_df, sum_df], axis=1)
row_df.columns = column_label
info_df = pd.concat([info_df, row_df], axis=0)
sectors.append('Sum')
info_df.index = sectors
return info_df
# stock_info_df: a DataFrame with columns Symbol, Name, Sector
stock_info_df = read_s_and_p_stock_info(s_and_p_file)
stock_l: list = list(set(stock_info_df['Symbol']))
stock_l.sort()
market_data = MarketData(start_date=start_date)
# Get close prices for the S&P 500 list
close_prices_df = market_data.get_close_data(stock_l)
final_stock_list = list(close_prices_df.columns)
mask = stock_info_df['Symbol'].isin(final_stock_list)
# Some stocks were listed on the stock exchange later than start_date. In this case the stock will not
# be returned by MarketData.get_close_data(). final_stock_info has the Symbol, Name, Sector for the
# set of stocks that it was possible to obtain close prices for the date range.
final_stock_info_df = stock_info_df[mask]
sectors = extract_sectors(final_stock_info_df)
pairs_info_df = calc_pair_counts(sectors)
# -
#
# <p>
# The table below shows the number of unique pairs for each S&P 500 sector and the total number of pairs. By drawing pairs from sectors,
# rather than the whole S&P 500 set of stocks, the number of possible pairs is reduced from 124,750 to the sector total shown in the table.
# </p>
# +
print(tabulate(pairs_info_df, headers=[*pairs_info_df.columns], tablefmt='fancy_grid'))
# An experiment to replace tabulate tables. Unfortunately, this table does not display when the notebook is examined
# on GitHub.
#
# import plotly.graph_objects as go
#
# fig = go.Figure(data=[go.Table(
# header=dict(values=['S&P 500 sector', *pairs_info_df.columns],
# fill_color='paleturquoise',
# align='left'),
# cells=dict(values=[pairs_info_df.index, pairs_info_df['num stocks'], pairs_info_df['num pairs']],
# fill_color='lavender',
# align='left'))
# ])
#
# fig.show()
# -
# <h3>
# Lookback Time Period
# </h3>
# <p>
# Pairs are selected for trading using a lookback period. The longer the lookback period, the
# less error there will be in the selection statistics, assuming that the data is stable
# (e.g., constant mean and standard deviation). Stock price time series are not stable over time, however. The mean and the
# standard deviation change, as do other statistics like correlation and cointegration (mean reversion).
# </p>
# <p>
# In using a lookback period to choose trading pairs we are making the assumption that the past
# will resemble the future trading period (as we shall see, this is not true in the majority of cases).
# The longer the lookback period, the less likely it is that the statistics will match
# the trading period. This creates a tension between statistical accuracy and statistics that are more likely to reflect
# the future trading period.
# </p>
# <p>
# A half year period is used for the lookback period. In practice the statistics for a year period (252 trading days) and a
# six month period (126 trading days) seem to be similar. We assume that the six month period will more accurately resemble the
# future trading period.
# </p>
#
# <h2>
# Mean Reversion
# </h2>
# <p>
# A single stock price series is rarely stationary and mean reverting. In pairs trading, we are looking for stock pairs where the
# stock price spread time series is stationary and mean reverting in the in-sample (lookback) period. We are making a bet that the
# spread time series will also be stationary and mean reverting in the out-of-sample trading period.
# </p>
# <p>
# When a pair forms a mean reverting, stationary time series, it is referred to as a cointegrated time series.
# </p>
# <blockquote>
# <p>
# This (linear) price data combination of n different time series into one price data series is called cointegration and the resulting
# price series w.r.t. financial data is called a cointegrated pair.
# </p>
# <p>
# Cointegration Analysis of Financial Time Series Data by Johannes Steffen, Pascal Held and Rudolf Kruse, 2014
# </p>
# <p>
# https://www.inf.ovgu.de/inf_media/downloads/forschung/technical_reports_und_preprints/2014/02_2014.pdf
# </p>
# </blockquote>
# <p>
# In the equation below, <i>s</i> is the stationary mean reverting weighted spread time series, P<sub>A</sub> is the price series for stock A,
# P<sub>B</sub> is the price series for stock B and β is the weight factor (for one share of stock A there will be β shares of
# stock B).
# </p>
#
# \$ s = P_A - \beta P_B $
#
# \$ s = \mu \space when \space P_A = \beta P_B $
#
# \$ s > \mu + \delta \space when \space P_A > \beta P_B \space (short \space P_A, \space long \space P_B) $
#
# \$ s < \mu - \delta \space when \space P_A < \beta P_B \space (long \space P_A, \space short \space P_B) $
#
# <p>
# When s is above the mean at some level, delta (perhaps one standard deviation), a short position will be taken in stock A and a long position will be taken in β shares of stock B. When s is below the mean at some level (perhaps one standard deviation) a long position will be taken in stock A and a short position will be taken in β shares of stock B.
# </p>
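# <p>
# The sketch below (a minimal illustration, not the trading code used later in this notebook) shows how the spread
# <i>s</i> and a long/short signal could be formed once the weight β has been estimated. The function name and the
# +1/-1 signal encoding are assumptions made for the example.
# </p>
# +
def pair_spread_signal(price_a: pd.Series, price_b: pd.Series, beta: float, delta: float) -> pd.DataFrame:
    """
    Minimal sketch: compute s = P_A - beta * P_B and a signal that is -1 (short A, long beta shares of B)
    when s is more than delta above its mean, +1 (long A, short beta shares of B) when s is more than
    delta below its mean, and 0 otherwise.
    """
    spread = price_a - beta * price_b
    spread_mean = spread.mean()
    signal = pd.Series(0, index=spread.index)
    signal[spread > spread_mean + delta] = -1
    signal[spread < spread_mean - delta] = 1
    return pd.DataFrame({'spread': spread, 'signal': signal})
# -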
# <p>
# In identifying a pair for pairs trading, a determination is made on whether <i>s</i> is mean reverting. The process of determining
# mean reversion will also provide the value of β.
# </p>
# <h2>
# Testing for Cointegration and Mean Reversion
# </h2>
# <p>
# Highly correlated pairs are selected from a common industry sector. Once a pair with high correlation is identified, the next step
# is to test whether the pair is cointegrated and mean reverting. Two tests are commonly used to test for mean reversion:
# </p>
# <ol>
# <li>
# Engle-Granger Test: Linear Regression and the Augmented Dickey-Fuller (ADF) test
# </li>
# <li>
# The Johansen Test
# </li>
# </ol>
# <p>
# Each of these tests has advantages and disadvantages, which will be discussed below.
# </p>
# <h3>
# Engle-Granger Test: Linear Regression and the Augmented Dickey-Fuller Test
# </h3>
# <p>
# Two linear regressions are performed on the price series of the stock pair. The residuals of the regression with the highest
# slope are tested with the Augmented Dickey-Fuller (ADF) test to determine whether mean reversion is likely.
# </p>
# <p>
# Linear regression is designed to provide a measure of the effect of an independent variable (on the x-axis) on the dependent
# variable (on the y-axis). An example might be body mass index (a measure of body fat) on the x-axis and blood cholesterol on the y-axis.
# </p>
# <p>
# In selecting pairs, we pick pairs with a high correlation from the same industry sector. This means that some
# process is acting on both stocks. However, the movement of one stock does not necessarily cause movement in the other stock. Also,
# both stock price series are driven by an underlying random process. This means that linear regression is not perfectly suited
# for analyzing pairs. Two linear regressions are performed since we don't know which stock to pick as the dependent variable. The regression
# with the highest slope is used to analyze mean reversion and build the cointegrated time series.
# </p>
# <p>
# The result of the Engle-Granger test is the linear regression slope, which is the weight value in the equation above. The linear regression
# intercept is also available, along with the confidence level of the cointegration test (e.g., 90%, 95% or 99% confidence).
# </p>
#
# +
class CointData:
"""
cointegrated: boolean - true if the time series were cointegrated, false otherwise
    weight: float - in the stationary cointegration equation x = A - w * B
confidence: int - confidence level, in percent: 90, 95, 99
asset_a: str - the symbol for the time series A in the above equation
asset_b: str - the symbol for the time series B in the above equation
"""
def __init__(self,
cointegrated: bool,
confidence: int,
weight: float,
asset_a: str,
asset_b: str):
self.cointegrated = cointegrated
self.confidence = confidence
self.weight = weight
self.asset_a = asset_a
self.asset_b = asset_b
self.intercept = np.NAN
def __str__(self):
if self.has_intercept():
s = f'cointegrated: {self.cointegrated}, confidence: {self.confidence}, weight: {self.weight}, intercept: {self.get_intercept()} ({self.asset_a}, {self.asset_b})'
else:
s = f'cointegrated: {self.cointegrated}, confidence: {self.confidence}, weight: {self.weight}, ({self.asset_a}, {self.asset_b})'
return s
def set_intercept(self, intercept: float) -> None:
self.intercept = intercept
def get_intercept(self) -> float:
return self.intercept
def has_intercept(self) -> bool:
return not np.isnan(self.intercept)
class PairStatistics:
def __init__(self):
self.decimals = 2
pass
def correlation(self, data_a_df: pd.DataFrame, data_b_df: pd.DataFrame) -> float:
data_a = np.array(data_a_df).flatten()
data_b = np.array(data_b_df).flatten()
c = np.corrcoef(data_a, data_b)
cor_v = round(c[0, 1], 2)
return cor_v
def find_interval(self, coint_stat: float, critical_vals: dict) -> Tuple[bool, int]:
"""
:param coint_stat: the ADF statistic
:param critical_vals: a dictionary defining the ADF intervals {'1%': -3.49, '5%': -2.89, '10%': -2.58}. The
dictionary values may be either positive or negative.
:return: if the adf_stat is in the critical value range, return True and the integer value of the interval
(e.g., 1, 5, 10). Or False and 0
"""
cointegrated = False
interval_key = ''
interval = 0
abs_coint_stat = abs(coint_stat)
for key, value in critical_vals.items():
abs_value = abs(value)
if abs_coint_stat > abs_value and abs_value > interval:
interval = abs_value
interval_key = key
cointegrated = True
key_val = int(interval_key.replace('%','')) if cointegrated else 0
return cointegrated, key_val
def engle_granger_coint(self, data_a: pd.DataFrame, data_b: pd.DataFrame) -> CointData:
sym_a = data_a.columns[0]
sym_b = data_b.columns[0]
data_b_const = sm.add_constant(data_b)
# x = I + b * A
result_ab = sm.OLS(data_a, data_b_const).fit()
data_a_const = sm.add_constant(data_a)
# x = I + b * B
result_ba = sm.OLS(data_b, data_a_const).fit()
slope_ab = result_ab.params[data_b.columns[0]]
slope_ba = result_ba.params[data_a.columns[0]]
result = result_ab
slope = slope_ab
if slope_ab < slope_ba:
t = sym_a
sym_a = sym_b
sym_b = t
result = result_ba
slope = slope_ba
intercept = round(result.params['const'], self.decimals)
slope = round(slope, self.decimals)
residuals = result.resid
adf_result = adfuller(residuals)
adf_stat = round(adf_result[0], self.decimals)
critical_vals = adf_result[4]
cointegrated, interval = self.find_interval(adf_stat, critical_vals)
coint_data = CointData(cointegrated=cointegrated, confidence=interval, weight=slope, asset_a=sym_a, asset_b=sym_b)
coint_data.set_intercept(intercept=intercept)
return coint_data
def johansen_coint(self, data_a: pd.DataFrame, data_b: pd.DataFrame) -> CointData:
ts_df = pd.concat([data_a, data_b], axis=1)
johansen_rslt = coint_johansen(ts_df, 0, 1)
hedge_val = round(np.abs(johansen_rslt.evec[0, 0] / johansen_rslt.evec[1, 0]), 2)
critical_vals = pd.DataFrame(johansen_rslt.trace_stat_crit_vals[0]).transpose()
critical_vals.columns = ['10%', '5%', '1%']
critical_vals_dict = critical_vals.to_dict('records')[0]
trace_stat = johansen_rslt.trace_stat[0]
cointegrated, interval = self.find_interval(coint_stat=trace_stat, critical_vals=critical_vals_dict)
sym_a = data_a.columns[0]
sym_b = data_b.columns[0]
coint_data = CointData(cointegrated=cointegrated, confidence=interval, weight=hedge_val, asset_a=sym_a,
asset_b=sym_b)
return coint_data
def compute_halflife(self, z_df: pd.DataFrame) -> int:
"""
Calculate the half-life of a mean reverting stationary series where the series
is an Ornstein–Uhlenbeck process
From example 7.5 in Quantitative Trading by Ernest P. Chan
"""
prevz = z_df.shift()
dz = z_df - prevz
dz = dz[1:]
prevz = prevz[1:]
result = sm.OLS(dz, prevz - np.mean(prevz)).fit()
theta = result.params
halflife_f = -np.log(2) / theta
halflife = round(halflife_f, 0)
return int(halflife)
def stationary_series(self, data_a: pd.DataFrame, data_b: pd.DataFrame, coint_data: CointData) -> pd.DataFrame:
"""
compute the stationary time series x = A - w * B or x = A - i - w * B if there is an intercept i
"""
data_df = pd.concat([data_a, data_b], axis=1)
data_a_df = pd.DataFrame(data_df[coint_data.asset_a])
data_b_df = pd.DataFrame(data_df[coint_data.asset_b])
if coint_data.has_intercept():
stationary_a = data_a_df.values - coint_data.get_intercept() - coint_data.weight * data_b_df.values
else:
stationary_a = data_a_df.values - coint_data.weight * data_b_df.values
stationary_df = pd.DataFrame(stationary_a.flatten())
stationary_df.index = data_df.index
return stationary_df
def plot_stationary_ts(stationary_df: pd.DataFrame, plus_delta: float, minus_delta: float, title: str) -> None:
stationary_df.plot(grid=True, title=title, figsize=(10, 6))
stat_mean = stationary_df.mean()[0]
plt.axhline(y=stat_mean, color='black', linewidth=2)
plt.axhline(y=stat_mean + plus_delta, color='red', linewidth=1, linestyle='--')
plt.axhline(y=stat_mean - minus_delta, color='green', linewidth=1, linestyle='--')
plt.show()
half_year = int(trading_days/2)
close_index = close_prices_df.index
d2007_ix = 0
d2007_close = pd.DataFrame(close_prices_df[['AAPL', 'MPWR', 'YUM']]).iloc[d2007_ix:d2007_ix+half_year]
d2007_cor = round(d2007_close.corr().iloc[0,[1,2]], 2)
cor_df = pd.DataFrame([d2007_cor])
cor_df.index = ['2007']
# -
# <h3>
# Example: AAPL/MPWR
# </h3>
# <p>
# AAPL (Apple Inc) and MPWR (Monolithic Power Systems, Inc) are both in the technology industry sector, while YUM (Yum Brands) is a food
# company in a different industry sector. The correlations of MPWR and YUM with AAPL in the first half of 2007 are shown below.
# Other than overall stock market dynamics, we would not expect AAPL and YUM to be correlated, so this
# appears to be an example of spurious correlation.
# </p>
# +
print(tabulate(cor_df, headers=[*cor_df.columns], tablefmt='fancy_grid'))
# -
# <p>
# Monolithic Power Systems, Inc. (MPWR) stock grew at a rate that was similar to Apple's, although its overall market
# capitalization is a fraction of Apple's now. I was unfamiliar with MPWR until I wrote this notebook. Bloomberg describes
# MPWR's business as:
# </p>
# <blockquote>
# Monolithic Power Systems, Inc. designs and manufactures power management solutions. The Company provides power conversion, LED lighting, load switches, cigarette lighter adapters, chargers, position sensors, analog input, and other electrical components. Monolithic Power Systems serves customers globally.
# </blockquote>
# <p>
# A brief digression: Apple is now (2022) one of the most
# valuable companies in the world. A number of investment funds bought Apple shares and were able to beat the overall market
# for the years when Apple had exceptional growth.
# </p>
# <p>
# My father invested in such a fund. Every quarter they sent out a "market outlook"
# newsletter. This newsletter was filled with blather to make fund investors think that the people managing the fund had special insight into the
# stock market (which justified the fees the fund charged). In fact, their only insight was their position in Apple stock.
# When Apple's share price plateaued, so did the fund's returns.
# </p>
# <p>
# Pairs may have high correlation without being cointegrated. The (Engle) Granger test shows that in the first half of 2007
# AAPL and MPWR were cointegrated at the 99% level.
# </p>
# +
pair_stat = PairStatistics()
d2007_aapl = pd.DataFrame(d2007_close['AAPL'])
d2007_mpwr = pd.DataFrame(d2007_close['MPWR'])
d2007_yum = pd.DataFrame(d2007_close['YUM'])
coint_data_granger_aapl_mpwr = pair_stat.engle_granger_coint(data_a=d2007_aapl, data_b=d2007_mpwr)
print(f'Granger test for cointegration (AAPL/MPWR): {coint_data_granger_aapl_mpwr}')
data_a = pd.DataFrame(d2007_close[coint_data_granger_aapl_mpwr.asset_a])
data_b = pd.DataFrame(d2007_close[coint_data_granger_aapl_mpwr.asset_b])
stationary_df = pair_stat.stationary_series(data_a=data_a, data_b=data_b, coint_data=coint_data_granger_aapl_mpwr)
# -
# <p>
# The confidence value reported is the error percentage: 1 means 1% error (99% confidence), 5 means 5% error (95% confidence) and
# 10 means 10% error (90% confidence).
# </p>
# <p>
# The plot below shows the stationary spread time series formed by
# </p>
#
# \$ s = MPWR - intercept - 3.7 * AAPL $
#
# <p>
# The dashed lines are at plus one and minus one standard deviation. The intercept adjusts the time series so that the mean is zero.
# </p>
# +
std_dev = stationary_df.std()[0]
plot_stationary_ts(stationary_df=stationary_df, plus_delta=std_dev, minus_delta=std_dev, title='Granger AAPL/MPWR stationary time series, 1st half of 2007' )
# -
# <p>
# AAPL and MPWR are both technology stocks that have some overlapping businesses (MPWR's products are used by companies like Apple). We would
# expect that the two stocks might be cointegrated.
# </p>
# <p>
# The test for cointegration is performed in a lookback period. The next period is the trading period where the close prices of the
# pair are combined with the weight (and perhaps the intercept) to form what we hope will be a stationary mean reverting time series
# that can be profitably traded.
# </p>
# <p>
# The test below applies the Granger test to the second half of 2007 to see if mean reversion persisted. As the test result shows,
# this is not the case.
# </p>
# +
second_half_start = d2007_ix+half_year
d2007_aapl_2 = pd.DataFrame(close_prices_df['AAPL']).iloc[second_half_start:second_half_start+half_year]
d2007_mpwr_2 = pd.DataFrame(close_prices_df['MPWR']).iloc[second_half_start:second_half_start+half_year]
coint_data_granger_aapl_mpwr_2 = pair_stat.engle_granger_coint(data_a=d2007_aapl_2, data_b=d2007_mpwr_2)
print(f'Granger test for cointegration (AAPL/MPWR): second half of 2007 {coint_data_granger_aapl_mpwr_2}')
# -
# <p>
# The pair AAPL and YUM (Yum Brands, a food company) would not be expected to be cointegrated (although they have a surprisingly
# high correlation). As expected, the Granger test does not show cointegration and mean reversion.
# </p>
coint_data_granger_aapl_yum = pair_stat.engle_granger_coint(data_a=d2007_aapl, data_b=d2007_yum)
print(f'Granger test for cointegration (AAPL/YUM): {coint_data_granger_aapl_yum}')
# <h3>
# Correlation and Cointegration
# </h3>
# <p>
# In choosing pairs for pairs trading, we are looking for a stock pair that is influenced by similar factors. Industry sector and high
# correlation can be used as a first filter for pairs.
# </p>
# <p>
# The tests for cointegration may find that a pair with a low correlation value is cointegrated and mean reverting. One
# example is AAPL and technology sector stock GPN (Global Payments Inc.) For the first half of 2007, AAPL and GPN have
# a low correlation.
# </p>
# +
d2007_gpn = pd.DataFrame(close_prices_df['GPN']).iloc[d2007_ix:d2007_ix+half_year]
cor_df = pd.DataFrame([pair_stat.correlation(data_a_df=d2007_aapl, data_b_df=d2007_gpn)])
cor_df.index = ['2007']
cor_df.columns = ['Correlation AAPL/GPN']
print(tabulate(cor_df, headers=[*cor_df.columns], tablefmt='fancy_grid'))
# -
# <p>
# The normalized close prices for the two stocks in the first half of 2007 are shown below.
# </p>
# +
def normalize_df(data_df: pd.DataFrame) -> pd.DataFrame:
min_s = data_df.min()
max_s = data_df.max()
norm_df = (data_df - min_s) / (max_s - min_s)
return norm_df
d2007_aapl_norm = normalize_df(d2007_aapl)
d2007_gpn_norm = normalize_df(d2007_gpn)
plot_two_ts(data_a=d2007_aapl_norm, data_b=d2007_gpn_norm, title='Normalized AAPL/GPN first half of 2007',x_label='date', y_label='Normalized Price')
# -
#
# <p>
# The Granger test shows that AAPL and GPN are cointegrated with 99% confidence (1% error). The Johansen test (see below) also shows
# that AAPL and GPN are cointegrated with 99% confidence.
# </p>
coint_data_granger_aapl_gpn = pair_stat.engle_granger_coint(data_a=d2007_aapl, data_b=d2007_gpn)
print(f'Granger test for cointegration (AAPL/GPN) : {coint_data_granger_aapl_gpn}')
# <p>
# The plot below shows the stationary time series formed from AAPL and GPN close prices.
# </p>
stationary_df = pair_stat.stationary_series(data_a=d2007_aapl, data_b=d2007_gpn, coint_data=coint_data_granger_aapl_gpn)
stat_sd = stationary_df.std()[0]
title='Granger AAPL/GPN stationary time series, 1st half of 2007'
plot_stationary_ts(stationary_df=stationary_df, plus_delta=stat_sd, minus_delta=stat_sd, title=title)
# <p>
# Given their low correlation and unrelated businesses (computer and media hardware vs payments) this may be an example of spurious
# cointegration. If the cointegration is spurious, it may be more likely to break down in a future time period and the
# pair may not be profitable to trade.
# </p>
# <p>
# The Granger test for the second half of 2007 (e.g., the six months following the period above) is shown below:
# </p>
# +
d2007_gpn_2 = pd.DataFrame(close_prices_df['GPN']).iloc[second_half_start:second_half_start+half_year]
coint_data_granger_aapl_gpn_2 = pair_stat.engle_granger_coint(data_a=d2007_aapl_2, data_b=d2007_gpn_2)
print(f'Granger test for cointegration (AAPL/GPN): second half of 2007 {coint_data_granger_aapl_gpn_2}')
# -
# <p>
# The Granger test shows that there is no cointegration in the six month period following the first half of 2007, which reinforces the idea
# that the previous cointegration was spurious.
# </p>
# <h3>
# The Johansen Test
# </h3>
# <p>
# The Granger linear regression based test can only be used for two assets. The Johansen test can be used on more than two assets.
# </p>
# <p>
# The Johansen test uses eigenvalue decomposition to estimate cointegration. In contrast to the Granger test, which has two steps
# (linear regression and the ADF test), the Johansen test is a single step test that also provides the weight factors. There is no linear
# constant (regression intercept) as there is with the Granger test, so the Johansen result may not have a mean of zero.
# </p>
# <p>
# The Johansen test and the Granger test do not always agree. The Johansen test is applied to AAPL/MPWR for the close prices from the first
# half of 2007. The Johansen test shows no cointegration, although the Granger test showed cointegration at the 99% confidence level.
# </p>
coint_data_johansen_aapl_mpwr = pair_stat.johansen_coint(data_a=d2007_aapl, data_b=d2007_mpwr)
print(f'Johansen test for cointegration (AAPL/MPWR), first half of 2007 : {coint_data_johansen_aapl_mpwr}')
# <p>
# In the research papers on cointegration, there doesn't seem to be a consensus on whether the Granger or
# Johansen tests are better. Some authors suggest using both tests, but they don't provide any empirical insight into why this might be advantageous.
# </p>
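# <p>
# As a small illustration of combining the two tests (a sketch, not used elsewhere in this notebook), a pair could be
# accepted only when both the Granger and Johansen tests report cointegration. The function name below is an assumption
# made for the example; it reuses the PairStatistics object and the AAPL/MPWR data defined above.
# </p>
# +
def both_tests_agree(data_a: pd.DataFrame, data_b: pd.DataFrame, stats: PairStatistics) -> bool:
    """Return True only if both the Engle-Granger and Johansen tests find cointegration for the pair."""
    granger = stats.engle_granger_coint(data_a=data_a, data_b=data_b)
    johansen = stats.johansen_coint(data_a=data_a, data_b=data_b)
    return granger.cointegrated and johansen.cointegrated

print(f'Granger and Johansen agree on AAPL/MPWR (1st half of 2007): {both_tests_agree(d2007_aapl, d2007_mpwr, pair_stat)}')
# -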
#
# <h2>
# Correlation
# </h2>
# <p>
# After selecting stocks based on their industry sector, the next filter used is the pair correlation of the close prices.
# </p>
# <p>
# Stocks that are strongly correlated are more likely to also exhibit mean reversion since they have similar market behavior.
# This section examines the correlation statistics for the S&P 500 sector pairs.
# </p>
#
# +
def display_histogram(data_v: np.array, x_label: str, y_label: str) -> None:
num_bins = int(np.sqrt(data_v.shape[0])) * 4
fix, ax = plt.subplots(figsize=(10, 8))
ax.set_xlabel(x_label)
ax.set_ylabel(y_label)
ax.grid(True)
ax.hist(data_v, bins=num_bins, color="blue", ec="blue")
plt.show()
# +
class SerialCorrelation:
class SerialCorrResult:
def __init__(self, pair: Tuple, corr_df: pd.DataFrame):
self.pair: Tuple = pair
self.corr_df: pd.DataFrame = corr_df
def __init__(self, stock_close_df: pd.DataFrame, pairs_list: List[Tuple], window: int):
self.stock_close_df = stock_close_df
self.pairs_list = pairs_list
self.window = window
self.index = self.stock_close_df.index
def calc_pair_serial_correlation(self, pair) -> SerialCorrResult:
stock_a_sym = pair[0]
stock_b_sym = pair[1]
stock_a_df = self.stock_close_df[stock_a_sym]
stock_b_df = self.stock_close_df[stock_b_sym]
corr_list = list()
date_list = list()
for ix in range(0, self.stock_close_df.shape[0], self.window):
stock_a_win = log(stock_a_df.iloc[ix:ix + self.window])
stock_b_win = log(stock_b_df.iloc[ix:ix + self.window])
c = np.corrcoef(stock_a_win, stock_b_win)
corr = round(c[0, 1], 2)
corr_list.append(corr)
date_list.append(self.index[ix])
corr_df = pd.DataFrame(corr_list)
corr_df.index = date_list
serial_corr_result = self.SerialCorrResult(pair, corr_df)
return serial_corr_result
def build_corr_frame(self, corr_list: List[SerialCorrResult]) -> pd.DataFrame:
num_cols = len(corr_list)
num_rows = corr_list[0].corr_df.shape[0]
corr_m = np.zeros([num_rows, num_cols])
col_names = list()
for col_ix in range(num_cols):
pair = corr_list[col_ix].pair
col = f'{pair[0]}:{pair[1]}'
col_names.append(col)
corr_df = corr_list[col_ix].corr_df
for row_ix in range(num_rows):
corr_m[row_ix, col_ix] = corr_df.iloc[row_ix]
corr_df = pd.DataFrame(corr_m)
corr_df.columns = col_names
corr_df.index = corr_list[0].corr_df.index
return corr_df
def serial_correlation(self) -> pd.DataFrame:
# serial_corr_list = list()
# for pair in self.pairs_list:
# serial_corr = self.calc_pair_serial_correlation(pair)
# serial_corr_list.append(serial_corr)
with Pool() as mp_pool:
serial_corr_list = mp_pool.map(self.calc_pair_serial_correlation, self.pairs_list)
corr_df = self.build_corr_frame(serial_corr_list)
return corr_df
pairs_list = get_pairs(sectors)
serial_correlation = SerialCorrelation(close_prices_df, pairs_list, half_year)
apple_tuple: Tuple = ('AAPL', 'MPWR')
apple_corr_result: SerialCorrelation.SerialCorrResult = serial_correlation.calc_pair_serial_correlation(apple_tuple)
apple_corr_df = apple_corr_result.corr_df
# -
# <p>
# The lookback period for pairs selection is six months (126 trading days). As a first step, all of the S&P 500 sector
# pairs will be tested for correlation over the lookback period.
# </p>
# <p>
# Correlation is calculated on the log of the close prices for each stock pair.
# </p>
# <p>
# The windowed correlation is not stable. The plot below shows the correlation between two stocks, AAPL and MPWR, over windowed
# periods from the start date.
# </p>
# +
plot_ts(data_s=apple_corr_df[0], title=f'correlation between {apple_tuple[0]} and {apple_tuple[1]}',
x_label='Window Start Date', y_label=f'Correlation over {half_year} day window')
# -
# <p>
# Since correlation is not stable, a stock pair that is highly correlated in one time period may be uncorrelated (or negatively
# correlated) in the next time period.
# </p>
#
# +
corr_df = serial_correlation.serial_correlation()
def calc_corr_dist(corr_df: pd.DataFrame, cut_off: float) -> pd.DataFrame:
count_list = list()
for row_num, row in corr_df.iterrows():
count = 0
for val in row:
if val >= cut_off:
count = count + 1
count_list.append(count)
count_df = pd.DataFrame(count_list)
count_df.index = corr_df.index
return count_df
cor_a = np.array(corr_df.values).ravel()
# -
# <p>
# The histogram below shows the aggregate distribution of the pair correlation over all half year time periods.
# </p>
# +
display_histogram(cor_a, 'Correlation between pairs', 'Count')
# -
# <p>
# The pairs are selected from a common S&P industry sector, so there are a significant number of pairs that
# have a correlation above 0.75.
# </p>
# <p>
# There are a small number of pairs that have a strong negative correlation (between -0.75 and approximately -0.9).
# Initially we look at pairs that have a strong positive correlation, but it may be unwise to ignore the strongly negatively
# correlated pairs as well.