cleaning
Thomas Schmelzer committed Apr 30, 2020
1 parent 2c60c30 commit 2f58fe1
Showing 10 changed files with 140 additions and 132 deletions.
39 changes: 24 additions & 15 deletions README.md
@@ -6,27 +6,36 @@ We make heavy use of the scipy.cluster.hierarchy package.
Here's a simple example

```python
-import numpy as np
+import pandas as pd
+from pyhrp.hrp import dist, linkage, tree, _hrp

 from pyhrp.graph import dendrogram
-from pyhrp.hrp import hrp_feed, linkage, tree
-from pyhrp.linalg import dist, correlation_from_covariance

-# use a small covariance matrix
-cov = np.array([[1, 0.5, 0.2], [0.5, 2, 0.2], [0.2, 0.2, 3]])
-
-# we compute the root(node) of a graph here
-link = linkage(dist(correlation_from_covariance(cov)), 'ward')
-root = tree(link)
-
-v, weights = hrp_feed(node=root, cov=cov)
-
-print(weights)
+prices = pd.read_csv("test/resources/stock_prices.csv", index_col=0, parse_dates=True)
+
+returns = prices.pct_change().dropna(axis=0, how="all")
+cov, cor = returns.cov(), returns.corr()
+links = linkage(dist(cor.values), method='ward')
+node = tree(links)
+
+rootcluster = _hrp(node, cov)

 # plot the dendrogram
-ax = dendrogram(link, orientation="left")
+ax = dendrogram(links, orientation="left")
 ax.get_figure().savefig("dendrogram.png")
```
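The `tree` call converts scipy's linkage matrix into a `ClusterNode` hierarchy. A minimal sketch of how to inspect it, assuming the `node` built in the snippet above:

```python
# leaves are visited from left to right; this left-to-right order is the
# "quasi-diagonal" ranking that pyhrp/marcos.py feeds into its bisection
print(node.pre_order())  # list of leaf (column) indices
print(node.count)        # number of leaves below the root
```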
For convenience you can bypass the construction of the covariance and correlation matrix, the links and the node, i.e. the root of the tree (dendrogram).
```python
+import pandas as pd
+from pyhrp.hrp import hrp
+
+prices = pd.read_csv("test/resources/stock_prices.csv", index_col=0, parse_dates=True)
+root = hrp(prices=prices)
```
You may expect a weight series here, but instead the `hrp` function returns a `Cluster` object. The `Cluster` simplifies all further post-analysis.
```python
+print(root.weights)
+print(root.variance)
+# You can drill into the graph by going downstream
+print(root.left)
+print(root.right)
```
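Since the weights come back as a plain pandas Series, the usual post-analysis is a one-liner away. A minimal sketch, assuming `root` is the `Cluster` returned by `hrp(prices=prices)` above:

```python
w = root.weights                               # pandas Series, indexed by asset

print(w.sum())                                 # close to 1.0 by construction
print(w.sort_values(ascending=False).head(5))  # the five largest positions
```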

## Installation:
16 changes: 2 additions & 14 deletions pyhrp/cluster.py
@@ -32,23 +32,16 @@ def rp(v_left, v_right):

 class Cluster(object):
     def __init__(self, assets, variance, left=None, right=None):
-        # assert len(assets) == len(weights)
-        # assert len(assets) == len(set(assets))
-        # assert isinstance(weights, np.ndarray)
         w = np.array(list(assets.values()))

         assert np.all(w > 0)
         assert variance >= 0

         # test that the weights sum to (approximately) 1.0
         assert np.isclose(np.sum(w), 1.0)

-        # distinct values in assets dictionary
-        # assert len(set(assets.values())) == len(assets)
-
         self.__assets = assets
         self.__variance = variance
-        # self.__weights = weights

         self.__left = left
         self.__right = right

@@ -59,9 +52,6 @@ def __init__(self, assets, variance, left=None, right=None):
         # left is not None, hence both left and right have to be clusters
         assert isinstance(left, Cluster)
         assert isinstance(right, Cluster)
-
-        # assert self.__assets == {**left.assets, **right.assets}
-        # assert set(left.assets.keys()).isdisjoint(set(right.assets.keys()))
         assert set(left.assets.keys()).isdisjoint(set(right.assets.keys()))

     @property
@@ -85,6 +75,4 @@ def is_leaf(self):

     @property
     def weights(self):
-        a = pd.Series(self.assets, name="Weights")
-        a.index.name = "Asset"
-        return a.sort_index()
+        return pd.Series(self.assets, name="Weights").sort_index()
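For orientation, a small sketch of how the invariants above behave when a `Cluster` is built by hand; the asset names and numbers are invented for illustration:

```python
from pyhrp.cluster import Cluster

# weights must be strictly positive and sum to 1.0; the variance is non-negative
c = Cluster(assets={"A": 0.4, "B": 0.6}, variance=1.5)

print(c.is_leaf)   # True, since no left/right children were given
print(c.weights)   # pandas Series named "Weights", sorted by asset
```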
92 changes: 22 additions & 70 deletions pyhrp/hrp.py
@@ -2,7 +2,21 @@
 import scipy.cluster.hierarchy as sch

 from pyhrp.cluster import Cluster, risk_parity
-from pyhrp.linalg import dist
+import scipy.spatial.distance as ssd
+
+
+def dist(cor):
+    """
+    Compute the correlation-based distance matrix d, compare with page 239 of the first book by Marcos Lopez de Prado
+    :param cor: the n x n correlation matrix
+    :return: the condensed distance matrix d, indicating the distance between columns i and j; all diagonal entries of the square form are zero
+    """
+    # https://stackoverflow.com/questions/18952587/
+    matrix = np.sqrt(np.clip((1.0 - cor) / 2., a_min=0.0, a_max=1.0))
+    np.fill_diagonal(matrix, val=0.0)
+    return ssd.squareform(matrix)


def linkage(dist, method="ward", **kwargs):
@@ -25,82 +39,19 @@ def tree(linkage):
     """
     return sch.to_tree(linkage, rd=False)

-#
-# def bisection(ids):
-#     """
-#     Compute the graph underlying the recursive bisection of Marcos Lopez de Prado
-#
-#     :param ids: A (ranked) set of indices
-#     :return: The root ClusterNode of this tree
-#     """
-#
-#     def split(ids):
-#         # split the vector ids in two parts, split in the middle
-#         assert len(ids) >= 2
-#         n = len(ids)
-#         return ids[:n // 2], ids[n // 2:]
-#
-#     assert len(ids) >= 1
-#
-#     if len(ids) == 1:
-#         return sch.ClusterNode(id=ids[0])
-#
-#     left, right = split(ids)
-#     return sch.ClusterNode(id=nr.randint(low=100000, high=200000), left=bisection(ids=left), right=bisection(ids=right))


-# def __hrp(node, cov, weights):
-#     if node.is_leaf():
-#         # a node is a leaf if it has no further relatives downstream. No leaves, no branches...
-#         return cov[node.id][node.id], weights
-#     else:
-#         # compute the variance of the left branch
-#         v_left, _ = __hrp(node.left, cov, weights)
-#
-#         # compute the variance of the right branch
-#         v_right, _ = __hrp(node.right, cov, weights)
-#
-#         # compute the split factors alpha_left and alpha_right
-#         # the split is such that v_left * alpha_left == v_right * alpha_right and alpha_left + alpha_right = 1
-#         alpha_left, alpha_right = risk_parity(v_left, v_right)
-#
-#         # compile a list of reachable leafs from the left node and from the right node
-#         # this could be done with an expensive recursive function but scipy's tree provides a powerful pre_order
-#         left, right = node.left.pre_order(), node.right.pre_order()
-#
-#         # update the weights linked to those leafs
-#         weights[left], weights[right] = alpha_left * weights[left], alpha_right * weights[right]
-#
-#         # return the variance for the node and the updated weights
-#         return variance(w=weights[left + right], cov=sub(cov, idx=left + right)), weights


-def _hrp2(node, cov):
+def _hrp(node, cov):
     if node.is_leaf():
         # a node is a leaf if it has no further relatives downstream. No leaves, no branches...
         asset = cov.keys().to_list()[node.id]
         return Cluster(assets={asset: 1.0}, variance=cov[asset][asset])
     else:
-        cluster_left = _hrp2(node.left, cov)
-        cluster_right = _hrp2(node.right, cov)
+        cluster_left = _hrp(node.left, cov)
+        cluster_right = _hrp(node.right, cov)
         return risk_parity(cluster_left, cluster_right, cov=cov)


-# def hrp_feed(cov, node=None):
-#     """
-#     Computes the expected variance and the weights for the hierarchical risk parity portfolio
-#     :param cov: This is the covariance matrix that shall be used
-#     :param node: Optional. This is the root node of the graph describing the dendrogram
-#     :return: variance, weights
-#     """
-#     if node is None:
-#         cor = correlation_from_covariance(cov)
-#         node = tree(linkage(dist(cor)))
-#
-#     return __hrp(node, cov, weights=np.ones(cov.shape[1]))


-def hrp_feed2(prices, node=None, method="single"):
+def hrp(prices, node=None, method="single"):
     """
     Computes the expected variance and the weights for the hierarchical risk parity portfolio
     :param prices: DataFrame of prices; covariance and correlation are estimated from its returns
@@ -109,7 +60,8 @@ def hrp_feed2(prices, node=None, method="single"):
     """
     returns = prices.pct_change().dropna(axis=0, how="all")
     cov, cor = returns.cov(), returns.corr()
-    node = node or tree(linkage(dist(cor.values), method=method))
+    links = linkage(dist(cor.values), method=method)
+    node = node or tree(links)

-    return _hrp2(node, cov)
+    return _hrp(node, cov)
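The numbers in `test_dist` (in test/test_hrp.py below) give a quick feel for the distance transform; a minimal sketch, reusing the correlation matrix from that test:

```python
import numpy as np

from pyhrp.hrp import dist

cor = np.array([[1.0, 0.2 / np.sqrt(2.0)],
                [0.2 / np.sqrt(2.0), 1.0]])

# d = sqrt((1 - rho) / 2), returned in condensed (squareform) shape
print(dist(cor))   # approximately [0.6552017]
```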

16 changes: 0 additions & 16 deletions pyhrp/linalg.py

This file was deleted.

9 changes: 6 additions & 3 deletions pyhrp/marcos.py
@@ -1,8 +1,11 @@
+# The original implementation by Marcos Lopez de Prado uses recursive bisection on a ranked list of columns of the covariance matrix.
+# To get to this list, Lopez de Prado uses what he calls matrix quasi-diagonalization, but it is induced by the order (from left to right) of the dendrogram.
+# Based on that we build a tree reflecting the recursive bisection.
+# With that tree and the covariance matrix we go back to the hrp algorithm.
 import numpy.random as nr
 import scipy.cluster.hierarchy as sch
-from pyhrp.linalg import dist

-from pyhrp.hrp import tree, linkage, _hrp2
+from pyhrp.hrp import tree, linkage, _hrp, dist


 def bisection(ids):
@@ -40,4 +43,4 @@ def marcos(prices, node=None):
     root = bisection(ids=ids)

     # It's not clear to me why Marcos goes down this route rather than sticking with the graph computed above.
-    return _hrp2(node=root, cov=cov)
+    return _hrp(node=root, cov=cov)
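To make the bisection concrete, a hedged sketch of the tree it builds for four ranked ids; the ids are invented, and `pre_order` is scipy's `ClusterNode` leaf traversal:

```python
from pyhrp.marcos import bisection

# four ranked column ids are split in the middle, then recursively again
root = bisection(ids=[0, 1, 2, 3])

print(root.left.pre_order())    # [0, 1]
print(root.right.pre_order())   # [2, 3]
```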
61 changes: 61 additions & 0 deletions pyhrp/obsolete.py
@@ -0,0 +1,61 @@
# Here we implement the HRP algorithm without the cluster concept. Rather, we pass around a weight vector.
# It's not(!) possible to do a smart post-analysis with this approach.

import numpy as np
import pandas as pd

from pyhrp.hrp import dist, tree, linkage


def __rp(v_left, v_right):
    """
    Compute the weights for a risk parity portfolio of two assets
    :param v_left: Variance of the "left" portfolio
    :param v_right: Variance of the "right" portfolio
    :return: w, 1 - w, the weights for the left and the right portfolio. It holds that w * v_left == (1 - w) * v_right, hence w = v_right / (v_right + v_left)
    """
    return v_right / (v_left + v_right), v_left / (v_left + v_right)


def __hrp(node, cov, weights):
    if node.is_leaf():
        # a node is a leaf if it has no further relatives downstream. No leaves, no branches...
        return cov[node.id][node.id], weights
    else:
        # compute the variance of the left branch
        v_left, _ = __hrp(node.left, cov, weights)

        # compute the variance of the right branch
        v_right, _ = __hrp(node.right, cov, weights)

        # compute the split factors alpha_left and alpha_right
        # the split is such that v_left * alpha_left == v_right * alpha_right and alpha_left + alpha_right = 1
        alpha_left, alpha_right = __rp(v_left, v_right)

        # compile a list of reachable leafs from the left node and from the right node
        # this could be done with an expensive recursive function but scipy's tree provides a powerful pre_order
        left, right = node.left.pre_order(), node.right.pre_order()

        # update the weights linked to those leafs
        weights[left], weights[right] = alpha_left * weights[left], alpha_right * weights[right]

        # return the variance for the node and the updated weights
        w = weights[left + right]
        c = cov[left + right, :][:, left + right]

        return np.linalg.multi_dot((w, c, w)), weights


def hrp(prices, node=None, method="single"):
    """
    Computes the expected variance and the weights for the hierarchical risk parity portfolio
    :param prices: DataFrame of prices; covariance and correlation are estimated from its returns
    :param node: Optional. This is the root node of the graph describing the dendrogram
    :return: variance, weights
    """
    returns = prices.pct_change().dropna(axis=0, how="all")
    cov, cor = returns.cov(), returns.corr()
    links = linkage(dist(cor.values), method=method)
    node = node or tree(links)

    return __hrp(node, cov.values, weights=np.ones(cov.shape[1]))
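A quick sanity check on the `__rp` split above, with invented variances: the lower-variance side receives the larger weight and both sides contribute equal risk, consistent with the 2/3 vs 1/3 split in the commented-out test in test/test_hrp.py below.

```python
v_left, v_right = 1.0, 2.0

w_left = v_right / (v_left + v_right)    # 2/3, the lower-variance side gets more weight
w_right = v_left / (v_left + v_right)    # 1/3

# equal risk contributions: w_left * v_left == w_right * v_right
assert abs(w_left * v_left - w_right * v_right) < 1e-12
```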
10 changes: 6 additions & 4 deletions test/test_hrp.py
@@ -1,8 +1,7 @@
 import numpy as np
 import pandas as pd

-from pyhrp.hrp import linkage, tree, hrp_feed2
-from pyhrp.linalg import dist
+from pyhrp.hrp import linkage, tree, hrp, dist
 from test.config import resource, get_data


@@ -49,6 +48,9 @@
 # print(root.right.weights)
 #
 # nt.assert_allclose(root.right.weights, np.array([2.0 / 3.0, 1.0 / 3.0]))
+def test_dist():
+    a = np.array([[1.0, 0.2 / np.sqrt(2.0)], [0.2 / np.sqrt(2.0), 1.0]])
+    np.testing.assert_allclose(dist(a), np.array([6.552017e-01]), rtol=1e-6, atol=1e-6)


def test_quasi_diag():
@@ -77,13 +79,13 @@ def test_quasi_diag():
 def test_hrp():
     prices = get_data()

-    root = hrp_feed2(prices=prices)
+    root = hrp(prices=prices)

     # uncomment this line if you want to generate a new file
     # root.weights_series(index=list(prices.keys())).to_csv(resource("weights_hrp.csv"), header=False)

     x = pd.read_csv(resource("weights_hrp.csv"), squeeze=True, index_col=0, header=None)
     x.name = "Weights"
-    x.index.name = "Asset"
+    x.index.name = None

     pd.testing.assert_series_equal(x, root.weights, check_exact=False)
9 changes: 0 additions & 9 deletions test/test_linalg.py

This file was deleted.

2 changes: 1 addition & 1 deletion test/test_marcos.py
@@ -14,6 +14,6 @@ def test_marcos():

     x = pd.read_csv(resource("weights_marcos.csv"), squeeze=True, index_col=0, header=None)
     x.name = "Weights"
-    x.index.name = "Asset"
+    x.index.name = None

     pd.testing.assert_series_equal(x, root.weights, check_exact=False)
18 changes: 18 additions & 0 deletions test/test_obsolete.py
@@ -0,0 +1,18 @@
import pandas as pd

from test.config import get_data, resource
from pyhrp.obsolete import hrp


def test_hrp():
    prices = get_data()

    variance, weights = hrp(prices=prices)

    w = pd.Series(index=prices.keys(), data=weights, name="Weights")

    x = pd.read_csv(resource("weights_hrp.csv"), squeeze=True, index_col=0, header=None)
    x.name = "Weights"
    x.index.name = None

    pd.testing.assert_series_equal(x, w, check_exact=False)
