Merge pull request czbiohub-sf#1 from czbiohub/phoenix/initial-xi

initial R and python code for xi correlation
avisionh · Mar 3, 2020 · 02075b8 · 02075b8
2 parents f9bd590 + 58d964c
commit 02075b8
Showing 1 changed file with 292 additions and 0 deletions.
diff --git a/xi.ipynb b/xi.ipynb
@@ -0,0 +1,292 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/phoenix/miniconda3/envs/correlation/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:17: FutureWarning: pandas.core.index is deprecated and will be removed in a future version.  The public classes are available in the top-level namespace.\n",
+      "  from pandas.core.index import Index as PandasIndex\n"
+     ]
+    }
+   ],
+   "source": [
+    "%load_ext rpy2.ipython"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 343,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%R\n",
+    "\n",
+    "# The following function computes the new correlation coefficient, and returns the P-value for testing independence unless otherwise specified. Removes NAs and converts factor variables to integers automatically, unless otherwise specified. In general, it is safe to just supply x and y, and leave the other parameters to their default values.\n",
+    "\n",
+    "# The following function computes the new correlation coefficient, and returns the P-value for testing independence unless otherwise specified. Removes NAs and converts factor variables to integers automatically, unless otherwise specified. In general, it is safe to just supply x and y, and leave the other parameters to their default values.\n",
+    "\n",
+    "xi = function(x, y, pvalue = T, ties = T, method = \"asymptotic\", nperm = 1000, factor = T) {\n",
+    "    \n",
+    "    # x and y are the data vectors\n",
+    "    # The P-value for the test of independence is returned if pvalue = T. Otherwise, only the coefficient is returned. \n",
+    "    # If ties = T, the algorithm assumes that the data has ties and employs the more elaborated theory for calculating the P-value. Otherwise, it uses the simpler theory. There is no harm in putting ties = T even if there are no ties.\n",
+    "    # method = \"asymptotic\" returns P-values computed by the asymptotic theory. If method = \"permutation\", a permutation test with nperm permutations is employed to estimate the P-value. Usually, there is no need for the permutation test. The asymptotic theory is good enough.\n",
+    "    # nperm is the number of permutations for the permutation test, if needed.\n",
+    "    # factor = T results in the algorithm checking whether x and y are factor variables and converting them to integers if they are. If it is known that the variables are numeric, a little bit of time can be saved by setting factor = F.\n",
+    "    \n",
+    "    \n",
+    "    # NAs are removed here:\n",
+    "    ok = complete.cases(x,y)\n",
+    "    x = x[ok]\n",
+    "    y = y[ok]    \n",
+    "    \n",
+    "    # Factor variables are converted to integers here:\n",
+    "    if (factor == T) {\n",
+    "        if (!is.numeric(x)) x = as.numeric(factor(x))\n",
+    "        if (!is.numeric(y)) y = as.numeric(factor(y))\n",
+    "    }\n",
+    "    \n",
+    "    \n",
+    "    n = length(x)                             # n is the sample size.\n",
+    "    \n",
+    "    PI = rank(x, ties.method = \"random\")        # PI is the rank vector for x, with ties broken at random \n",
+    "    \n",
+    "    \n",
+    "    f = rank(y, ties.method = \"max\")/n        # f[i] is number of j s.t. y[j] <= y[i], divided by n.\n",
+    "    \n",
+    "    g = rank(-y, ties.method = \"max\")/n        # g[i] is number of j s.t. y[j] >= y[i], divided by n.\n",
+    "    \n",
+    "    ord = order(PI)                            # order of the x's, ties broken at random.\n",
+    "    \n",
+    "    f = f[ord]                                # Rearrange f according to ord.\n",
+    "    \n",
+    "    # xi is calculated in the next three lines:\n",
+    "    A1 = mean(abs(f[1:(n-1)] - f[2:n]))*(n-1)/(2*n)\n",
+    "    C = mean(g*(1-g))\n",
+    "    xi = 1 - A1/C\n",
+    "    \n",
+    "    \n",
+    "    # If P-value needs to be computed:\n",
+    "    if (pvalue == T) {\n",
+    "        \n",
+    "        # If there are no ties, return xi and theoretical P-value:\n",
+    "        if (ties == F) return(list(xi = xi, pval = 1 - pnorm(sqrt(n)*xi/sqrt(2/5))))\n",
+    "        \n",
+    "        # If there are ties, and the theoretical method is to be used for calculation P-values:\n",
+    "        if (method == \"asymptotic\") {\n",
+    "            \n",
+    "            # The following steps calculate the theoretical variance in the presence of ties:\n",
+    "            q = sort(f)\n",
+    "            ind = c(1:n)\n",
+    "            ind2 = 2*n - 2*ind + 1\n",
+    "            a = mean(ind2*q*q)/n\n",
+    "            c = mean(ind2*q)/n\n",
+    "            cq = cumsum(q)\n",
+    "            m = (cq + (n - ind)*q)/n\n",
+    "            b = mean(m^2)\n",
+    "            v = (a - 2*b + c^2)/(C^2)\n",
+    "            \n",
+    "            # Return xi and P-value:\n",
+    "            return(list(xi = xi, pval = 1 - pnorm(sqrt(n)*xi/sqrt(v))))\n",
+    "        }\n",
+    "        \n",
+    "        # If permutation test is to be used for calculating P-value:\n",
+    "        if (method == \"permutation\") {\n",
+    "            r = rep(0, nperm)\n",
+    "            for (i in 1:nperm) {\n",
+    "                x1 = runif(n, 0, 1)\n",
+    "                r[i] = xi(x1,y)$xi\n",
+    "            }\n",
+    "        \n",
+    "            # Return xi and P-value based on permutation test:\n",
+    "            return(list(xi = xi, pval = mean(r > xi)))\n",
+    "        }\n",
+    "        cat(\"Invalid method. Use either asymptotic or permutation.\")\n",
+    "    }\n",
+    "    \n",
+    "    # If only xi is desired, return xi:\n",
+    "    else return(xi)\n",
+    "}\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 345,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import scipy.stats as ss\n",
+    "from scipy.stats import norm\n",
+    "\n",
+    "def xi(x, y):\n",
+    "    x= pd.Series(x)\n",
+    "    y= pd.Series(y)\n",
+    "    \n",
+    "    # remove NAs\n",
+    "    both_not_null = x.isnull() & y.isnull()\n",
+    "    x = x[~both_not_null]\n",
+    "    y = y[~both_not_null]\n",
+    "    \n",
+    "    # sample size\n",
+    "    n = len(x) \n",
+    "    # PI is the rank vector for x, with ties broken at ordinal (needs to be random)\n",
+    "    PI = x.rank(method='dense')\n",
+    "    # f[i] is number of j s.t. y[j] <= y[i], divided by n.\n",
+    "    f = y.rank(method=\"max\")/n        \n",
+    "    # g[i] is number of j s.t. y[j] >= y[i], divided by n.\n",
+    "    g = pd.Series([1-i for i in y]).rank(method=\"max\")/n \n",
+    "    # order of the x's, ties broken at random.\n",
+    "    ord = np.argsort(PI)   \n",
+    "    # Rearrange f according to ord.\n",
+    "    f = [f[i] for i in ord]                              \n",
+    "\n",
+    "    A1 = np.mean(np.abs([x-y for x,y in zip(f[0:(n-1)], f[1:n])]))*(n-1)/(2*n)\n",
+    "    C = np.mean(g*(1-g))\n",
+    "    xi_val = 1 - A1/C\n",
+    "    return xi_val, n, f, C\n",
+    "\n",
+    "\n",
+    "def pvalue_asymptotic(xi_val, n, f, C, ties = False, nperm = 1000, factor = True):\n",
+    "    \n",
+    "    # If there are no ties, return xi and theoretical P-value:\n",
+    "    if ties:\n",
+    "        return 1-ss.norm.cdf(np.sqrt(n)*xi_val/np.sqrt(2/5))\n",
+    "\n",
+    "    # If there are ties, and the theoretical method is to be used for calculation P-values:\n",
+    "    # The following steps calculate the theoretical variance in the presence of ties:\n",
+    "    q = sorted(f)\n",
+    "    ind = [i+1 for i in range(n)]\n",
+    "    ind2 = [2*n - 2*ind[i-1]+1 for i in ind]\n",
+    "    a = np.mean([i*j*j for i,j in zip(ind2,q)])/n\n",
+    "    c = np.mean([i*j for i,j in zip(ind2,q)])/n\n",
+    "    cq = np.cumsum(q)\n",
+    "    m =[(i + (n - j)*k)/n for i,j,k in zip(cq,ind,q)]\n",
+    "    b = np.mean([np.square(i) for i in m])\n",
+    "    v = (a - 2*b + np.square(c))/np.square(C)\n",
+    "    return 1-ss.norm.cdf(np.sqrt(n)*xi_val/np.sqrt(v))\n",
+    "    \n",
+    "\n",
+    "def pvalue_permutation_test(nperm, n, y):\n",
+    "#     # If permutation test is to be used for calculating P-value:\n",
+    "    r = np.zeros((nperm,), dtype=int)\n",
+    "    for idx,val in enumerate(nperm):\n",
+    "        # x1 = runif(n, 0, 1)\n",
+    "        x1 = np.random.uniform(n, 0, 1)\n",
+    "        r[idx] = xi(x1,y)\n",
+    "\n",
+    "        # Return xi and P-value based on permutation test:\n",
+    "        return (np.mean([ri for ri in r if ri > xi]))\n",
+    "\n",
+    "    \n",
+    "def wrapper(x, y, pvalue=True, ties=False, method=\"asymptotic\", nperm=1000, factor=True):\n",
+    "    xi_val, n, f, C = xi(x,y)\n",
+    "    if pvalue:\n",
+    "        if method == \"asymptotic\":\n",
+    "            pvalue = pvalue_asymptotic(xi_val, n, f, C, ties=ties, nperm=nperm, factor=factor)\n",
+    "        elif method == \"permutation\":\n",
+    "            pvalue = pvalue_permutation_test(nperm, n, y)\n",
+    "        else:\n",
+    "            print(f\"method: {method} currently not supported, please select asymptotic or permutation\")\n",
+    "            import sys\n",
+    "            sys.exit()\n",
+    "            \n",
+    "        return (xi_val, pvalue)\n",
+    "    return xi\n",
+    "            "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 346,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.2749999999999999 0.07841556446646347\n"
+     ]
+    }
+   ],
+   "source": [
+    "# def xi(x, y, pvalue=True, ties=True, method=\"asymptotic\", nperm=1000, factor=True):\n",
+    "x_i = [10,  8, 13,  9, 11, 14,  6,  4, 12,  7,  5]\n",
+    "y_i = [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]\n",
+    "x_ii = [10,  8, 13,  9, 11, 14,  6,  4, 12,  7,  5]\n",
+    "y_ii = [9.14, 8.14, 8.74, 8.77, 9.26, 8.1,  6.13, 3.1,  9.13, 7.26, 4.74]\n",
+    "pvalue = True\n",
+    "ties = False\n",
+    "method = \"asymptotic\"\n",
+    "nperm = 1000\n",
+    "factor = True\n",
+    "\n",
+    "xi_val, pvalue = wrapper(x_i, y_i, pvalue=True, ties=False, method=\"asymptotic\", nperm=1000, factor=True)\n",
+    "print(xi_val, pvalue)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 347,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "$xi\n",
+       "[1] 0.275\n",
+       "\n",
+       "$pval\n",
+       "[1] 0.07841556\n",
+       "\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "%%R\n",
+    "\n",
+    "x_i = c(10,  8, 13,  9, 11, 14,  6,  4, 12,  7,  5)\n",
+    "y_i = c( 8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68) \n",
+    "x_ii = c(10,  8, 13,  9, 11, 14,  6,  4, 12,  7,  5)\n",
+    "y_ii = c(9.14, 8.14, 8.74, 8.77, 9.26, 8.1,  6.13, 3.1,  9.13, 7.26, 4.74)\n",
+    "x_iii = c(10,  8, 13,  9, 11, 14,  6,  4, 12,  7,  5)\n",
+    "y_iii = c( 7.46,  6.77, 12.74,  7.11,  7.81,  8.84,  6.08,  5.39,  8.15,  6.42,  5.73)\n",
+    "x_iv = c( 8,  8,  8,  8,  8,  8,  8, 19,  8,  8,  8)\n",
+    "y_iv = c( 6.58,  5.76,  7.71,  8.84,  8.47,  7.04,  5.25, 12.5 ,  5.56,  7.91,  6.89)\n",
+    "\n",
+    "xi(x_i, y_i, pvalue = T, ties = T, method = \"asymptotic\", nperm = 1000, factor = T)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:correlation]",
+   "language": "python",
+   "name": "conda-env-correlation-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}