cleaning
Thomas Schmelzer committed Apr 30, 2020
1 parent 2c60c30 commit 2f58fe1
Showing 10 changed files with 140 additions and 132 deletions.
39 changes: 24 additions & 15 deletions README.md
@@ -6,27 +6,36 @@ We make heavy use of the scipy.cluster.hierarchy package.
Here's a simple example

```python
-import numpy as np
+import pandas as pd
+from pyhrp.hrp import dist, linkage, tree, _hrp

 from pyhrp.graph import dendrogram
-from pyhrp.hrp import hrp_feed, linkage, tree
-from pyhrp.linalg import dist, correlation_from_covariance

-# use a small covariance matrix
-cov = np.array([[1, 0.5, 0.2], [0.5, 2, 0.2], [0.2, 0.2, 3]])
-
-# we compute the root(node) of a graph here
-link = linkage(dist(correlation_from_covariance(cov)), 'ward')
-root = tree(link)
-
-v, weights = hrp_feed(node=root, cov=cov)
-
-print(weights)
+prices = pd.read_csv("test/resources/stock_prices.csv", index_col=0, parse_dates=True)
+
+returns = prices.pct_change().dropna(axis=0, how="all")
+cov, cor = returns.cov(), returns.corr()
+links = linkage(dist(cor.values), method='ward')
+node = tree(links)
+
+rootcluster = _hrp(node, cov)

 # plot the dendrogram
-ax = dendrogram(link, orientation="left")
+ax = dendrogram(links, orientation="left")
 ax.get_figure().savefig("dendrogram.png")
```
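The `tree` call converts scipy's linkage matrix into a `ClusterNode` hierarchy. A minimal sketch of how to inspect it, assuming the `node` built in the snippet above:

```python
# leaves are visited from left to right; this left-to-right order is the
# "quasi-diagonal" ranking that pyhrp/marcos.py feeds into its bisection
print(node.pre_order())  # list of leaf (column) indices
print(node.count)        # number of leaves below the root
```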
For convenience you can bypass the construction of the covariance and correlation matrix, the links and the node, i.e. the root of the tree (dendrogram).
```python
+import pandas as pd
+from pyhrp.hrp import hrp
+
+prices = pd.read_csv("test/resources/stock_prices.csv", index_col=0, parse_dates=True)
+root = hrp(prices=prices)
```
You may expect a weight series here, but instead the `hrp` function returns a `Cluster` object. The `Cluster` simplifies all further post-analysis.
```python
+print(root.weights)
+print(root.variance)
+# You can drill into the graph by going downstream
+print(root.left)
+print(root.right)
```
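Since the weights come back as a plain pandas Series, the usual post-analysis is a one-liner away. A minimal sketch, assuming `root` is the `Cluster` returned by `hrp(prices=prices)` above:

```python
w = root.weights                               # pandas Series, indexed by asset

print(w.sum())                                 # close to 1.0 by construction
print(w.sort_values(ascending=False).head(5))  # the five largest positions
```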

## Installation:
16 changes: 2 additions & 14 deletions pyhrp/cluster.py
@@ -32,23 +32,16 @@ def rp(v_left, v_right):

 class Cluster(object):
     def __init__(self, assets, variance, left=None, right=None):
-        # assert len(assets) == len(weights)
-        # assert len(assets) == len(set(assets))
-        # assert isinstance(weights, np.ndarray)
         w = np.array(list(assets.values()))

         assert np.all(w > 0)
         assert variance >= 0

         # test that the weights sum to (approximately) 1.0
         assert np.isclose(np.sum(w), 1.0)

-        # distinct values in assets dictionary
-        # assert len(set(assets.values())) == len(assets)
-
         self.__assets = assets
         self.__variance = variance
-        # self.__weights = weights

         self.__left = left
         self.__right = right

@@ -59,9 +52,6 @@ def __init__(self, assets, variance, left=None, right=None):
         # left is not None, hence both left and right have to be clusters
         assert isinstance(left, Cluster)
         assert isinstance(right, Cluster)
-
-        # assert self.__assets == {**left.assets, **right.assets}
-        # assert set(left.assets.keys()).isdisjoint(set(right.assets.keys()))
         assert set(left.assets.keys()).isdisjoint(set(right.assets.keys()))

     @property
@@ -85,6 +75,4 @@ def is_leaf(self):

     @property
     def weights(self):
-        a = pd.Series(self.assets, name="Weights")
-        a.index.name = "Asset"
-        return a.sort_index()
+        return pd.Series(self.assets, name="Weights").sort_index()
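For orientation, a small sketch of how the invariants above behave when a `Cluster` is built by hand; the asset names and numbers are invented for illustration:

```python
from pyhrp.cluster import Cluster

# weights must be strictly positive and sum to 1.0; the variance is non-negative
c = Cluster(assets={"A": 0.4, "B": 0.6}, variance=1.5)

print(c.is_leaf)   # True, since no left/right children were given
print(c.weights)   # pandas Series named "Weights", sorted by asset
```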
92 changes: 22 additions & 70 deletions pyhrp/hrp.py
@@ -2,7 +2,21 @@
 import scipy.cluster.hierarchy as sch

 from pyhrp.cluster import Cluster, risk_parity
-from pyhrp.linalg import dist
+import scipy.spatial.distance as ssd
+
+
+def dist(cor):
+    """
+    Compute the correlation-based distance matrix d, compare with page 239 of the first book by Marcos Lopez de Prado
+    :param cor: the n x n correlation matrix
+    :return: the condensed distance matrix d, indicating the distance between columns i and j; all diagonal entries of the square form are zero
+    """
+    # https://stackoverflow.com/questions/18952587/
+    matrix = np.sqrt(np.clip((1.0 - cor) / 2., a_min=0.0, a_max=1.0))
+    np.fill_diagonal(matrix, val=0.0)
+    return ssd.squareform(matrix)


def linkage(dist, method="ward", **kwargs):
@@ -25,82 +39,19 @@ def tree(linkage):
     """
     return sch.to_tree(linkage, rd=False)

-#
-# def bisection(ids):
-#     """
-#     Compute the graph underlying the recursive bisection of Marcos Lopez de Prado
-#
-#     :param ids: A (ranked) set of indices
-#     :return: The root ClusterNode of this tree
-#     """
-#
-#     def split(ids):
-#         # split the vector ids in two parts, split in the middle
-#         assert len(ids) >= 2
-#         n = len(ids)
-#         return ids[:n // 2], ids[n // 2:]
-#
-#     assert len(ids) >= 1
-#
-#     if len(ids) == 1:
-#         return sch.ClusterNode(id=ids[0])
-#
-#     left, right = split(ids)
-#     return sch.ClusterNode(id=nr.randint(low=100000, high=200000), left=bisection(ids=left), right=bisection(ids=right))


-# def __hrp(node, cov, weights):
-#     if node.is_leaf():
-#         # a node is a leaf if it has no further relatives downstream. No leaves, no branches...
-#         return cov[node.id][node.id], weights
-#     else:
-#         # compute the variance of the left branch
-#         v_left, _ = __hrp(node.left, cov, weights)
-#
-#         # compute the variance of the right branch
-#         v_right, _ = __hrp(node.right, cov, weights)
-#
-#         # compute the split factors alpha_left and alpha_right
-#         # the split is such that v_left * alpha_left == v_right * alpha_right and alpha_left + alpha_right = 1
-#         alpha_left, alpha_right = risk_parity(v_left, v_right)
-#
-#         # compile a list of reachable leafs from the left node and from the right node
-#         # this could be done with an expensive recursive function but scipy's tree provides a powerful pre_order
-#         left, right = node.left.pre_order(), node.right.pre_order()
-#
-#         # update the weights linked to those leafs
-#         weights[left], weights[right] = alpha_left * weights[left], alpha_right * weights[right]
-#
-#         # return the variance for the node and the updated weights
-#         return variance(w=weights[left + right], cov=sub(cov, idx=left + right)), weights


-def _hrp2(node, cov):
+def _hrp(node, cov):
     if node.is_leaf():
         # a node is a leaf if it has no further relatives downstream. No leaves, no branches...
         asset = cov.keys().to_list()[node.id]
         return Cluster(assets={asset: 1.0}, variance=cov[asset][asset])
     else:
-        cluster_left = _hrp2(node.left, cov)
-        cluster_right = _hrp2(node.right, cov)
+        cluster_left = _hrp(node.left, cov)
+        cluster_right = _hrp(node.right, cov)
         return risk_parity(cluster_left, cluster_right, cov=cov)


-# def hrp_feed(cov, node=None):
-#     """
-#     Computes the expected variance and the weights for the hierarchical risk parity portfolio
-#     :param cov: This is the covariance matrix that shall be used
-#     :param node: Optional. This is the root node of the graph describing the dendrogram
-#     :return: variance, weights
-#     """
-#     if node is None:
-#         cor = correlation_from_covariance(cov)
-#         node = tree(linkage(dist(cor)))
-#
-#     return __hrp(node, cov, weights=np.ones(cov.shape[1]))


-def hrp_feed2(prices, node=None, method="single"):
+def hrp(prices, node=None, method="single"):
     """
     Computes the expected variance and the weights for the hierarchical risk parity portfolio
     :param prices: DataFrame of prices; covariance and correlation are estimated from its returns
@@ -109,7 +60,8 @@ def hrp_feed2(prices, node=None, method="single"):
     """
     returns = prices.pct_change().dropna(axis=0, how="all")
     cov, cor = returns.cov(), returns.corr()
-    node = node or tree(linkage(dist(cor.values), method=method))
+    links = linkage(dist(cor.values), method=method)
+    node = node or tree(links)

-    return _hrp2(node, cov)
+    return _hrp(node, cov)
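The numbers in `test_dist` (in test/test_hrp.py below) give a quick feel for the distance transform; a minimal sketch, reusing the correlation matrix from that test:

```python
import numpy as np

from pyhrp.hrp import dist

cor = np.array([[1.0, 0.2 / np.sqrt(2.0)],
                [0.2 / np.sqrt(2.0), 1.0]])

# d = sqrt((1 - rho) / 2), returned in condensed (squareform) shape
print(dist(cor))   # approximately [0.6552017]
```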

16 changes: 0 additions & 16 deletions pyhrp/linalg.py

This file was deleted.

9 changes: 6 additions & 3 deletions pyhrp/marcos.py
@@ -1,8 +1,11 @@
+# The original implementation by Marcos Lopez de Prado uses recursive bisection on a ranked list of columns of the covariance matrix.
+# To get to this list, Lopez de Prado uses what he calls matrix quasi-diagonalization, but it is induced by the order (from left to right) of the dendrogram.
+# Based on that we build a tree reflecting the recursive bisection.
+# With that tree and the covariance matrix we go back to the hrp algorithm.
 import numpy.random as nr
 import scipy.cluster.hierarchy as sch
-from pyhrp.linalg import dist

-from pyhrp.hrp import tree, linkage, _hrp2
+from pyhrp.hrp import tree, linkage, _hrp, dist


 def bisection(ids):
@@ -40,4 +43,4 @@ def marcos(prices, node=None):
     root = bisection(ids=ids)

     # It's not clear to me why Marcos goes down this route rather than sticking with the graph computed above.
-    return _hrp2(node=root, cov=cov)
+    return _hrp(node=root, cov=cov)
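To make the bisection concrete, a hedged sketch of the tree it builds for four ranked ids; the ids are invented, and `pre_order` is scipy's `ClusterNode` leaf traversal:

```python
from pyhrp.marcos import bisection

# four ranked column ids are split in the middle, then recursively again
root = bisection(ids=[0, 1, 2, 3])

print(root.left.pre_order())    # [0, 1]
print(root.right.pre_order())   # [2, 3]
```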
61 changes: 61 additions & 0 deletions pyhrp/obsolete.py
@@ -0,0 +1,61 @@
# Here we implement the HRP algorithm without the cluster concept. Rather, we pass around a weight vector.
# It's not(!) possible to do a smart post-analysis with this approach.

import numpy as np
import pandas as pd

from pyhrp.hrp import dist, tree, linkage


def __rp(v_left, v_right):
    """
    Compute the weights for a risk parity portfolio of two assets
    :param v_left: Variance of the "left" portfolio
    :param v_right: Variance of the "right" portfolio
    :return: w, 1 - w, the weights for the left and the right portfolio. It holds that w * v_left == (1 - w) * v_right, hence w = v_right / (v_right + v_left)
    """
    return v_right / (v_left + v_right), v_left / (v_left + v_right)


def __hrp(node, cov, weights):
    if node.is_leaf():
        # a node is a leaf if it has no further relatives downstream. No leaves, no branches...
        return cov[node.id][node.id], weights
    else:
        # compute the variance of the left branch
        v_left, _ = __hrp(node.left, cov, weights)

        # compute the variance of the right branch
        v_right, _ = __hrp(node.right, cov, weights)

        # compute the split factors alpha_left and alpha_right
        # the split is such that v_left * alpha_left == v_right * alpha_right and alpha_left + alpha_right = 1
        alpha_left, alpha_right = __rp(v_left, v_right)

        # compile a list of reachable leafs from the left node and from the right node
        # this could be done with an expensive recursive function but scipy's tree provides a powerful pre_order
        left, right = node.left.pre_order(), node.right.pre_order()

        # update the weights linked to those leafs
        weights[left], weights[right] = alpha_left * weights[left], alpha_right * weights[right]

        # return the variance for the node and the updated weights
        w = weights[left + right]
        c = cov[left + right, :][:, left + right]

        return np.linalg.multi_dot((w, c, w)), weights


def hrp(prices, node=None, method="single"):
    """
    Computes the expected variance and the weights for the hierarchical risk parity portfolio
    :param prices: DataFrame of prices; covariance and correlation are estimated from its returns
    :param node: Optional. This is the root node of the graph describing the dendrogram
    :return: variance, weights
    """
    returns = prices.pct_change().dropna(axis=0, how="all")
    cov, cor = returns.cov(), returns.corr()
    links = linkage(dist(cor.values), method=method)
    node = node or tree(links)

    return __hrp(node, cov.values, weights=np.ones(cov.shape[1]))
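A quick sanity check on the `__rp` split above, with invented variances: the lower-variance side receives the larger weight and both sides contribute equal risk, consistent with the 2/3 vs 1/3 split in the commented-out test in test/test_hrp.py below.

```python
v_left, v_right = 1.0, 2.0

w_left = v_right / (v_left + v_right)    # 2/3, the lower-variance side gets more weight
w_right = v_left / (v_left + v_right)    # 1/3

# equal risk contributions: w_left * v_left == w_right * v_right
assert abs(w_left * v_left - w_right * v_right) < 1e-12
```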
10 changes: 6 additions & 4 deletions test/test_hrp.py
@@ -1,8 +1,7 @@
 import numpy as np
 import pandas as pd

-from pyhrp.hrp import linkage, tree, hrp_feed2
-from pyhrp.linalg import dist
+from pyhrp.hrp import linkage, tree, hrp, dist
 from test.config import resource, get_data


@@ -49,6 +48,9 @@
 # print(root.right.weights)
 #
 # nt.assert_allclose(root.right.weights, np.array([2.0 / 3.0, 1.0 / 3.0]))
+def test_dist():
+    a = np.array([[1.0, 0.2 / np.sqrt(2.0)], [0.2 / np.sqrt(2.0), 1.0]])
+    np.testing.assert_allclose(dist(a), np.array([6.552017e-01]), rtol=1e-6, atol=1e-6)


def test_quasi_diag():
@@ -77,13 +79,13 @@ def test_quasi_diag():
 def test_hrp():
     prices = get_data()

-    root = hrp_feed2(prices=prices)
+    root = hrp(prices=prices)

     # uncomment this line if you want to generate a new file
     # root.weights_series(index=list(prices.keys())).to_csv(resource("weights_hrp.csv"), header=False)

     x = pd.read_csv(resource("weights_hrp.csv"), squeeze=True, index_col=0, header=None)
     x.name = "Weights"
-    x.index.name = "Asset"
+    x.index.name = None

     pd.testing.assert_series_equal(x, root.weights, check_exact=False)
9 changes: 0 additions & 9 deletions test/test_linalg.py

This file was deleted.

2 changes: 1 addition & 1 deletion test/test_marcos.py
@@ -14,6 +14,6 @@ def test_marcos():

     x = pd.read_csv(resource("weights_marcos.csv"), squeeze=True, index_col=0, header=None)
     x.name = "Weights"
-    x.index.name = "Asset"
+    x.index.name = None

     pd.testing.assert_series_equal(x, root.weights, check_exact=False)
18 changes: 18 additions & 0 deletions test/test_obsolete.py
@@ -0,0 +1,18 @@
import pandas as pd

from test.config import get_data, resource
from pyhrp.obsolete import hrp


def test_hrp():
    prices = get_data()

    variance, weights = hrp(prices=prices)

    w = pd.Series(index=prices.keys(), data=weights, name="Weights")

    x = pd.read_csv(resource("weights_hrp.csv"), squeeze=True, index_col=0, header=None)
    x.name = "Weights"
    x.index.name = None

    pd.testing.assert_series_equal(x, w, check_exact=False)
