Skip to content

Commit

Permalink
Merge pull request czbiohub-sf#1 from czbiohub/phoenix/initial-xi
Browse files Browse the repository at this point in the history
initial R and python code for xi correlation
  • Loading branch information
olgabot authored Mar 3, 2020
2 parents f9bd590 + 58d964c commit 02075b8
Showing 1 changed file with 292 additions and 0 deletions.
292 changes: 292 additions & 0 deletions xi.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,292 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/phoenix/miniconda3/envs/correlation/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:17: FutureWarning: pandas.core.index is deprecated and will be removed in a future version. The public classes are available in the top-level namespace.\n",
" from pandas.core.index import Index as PandasIndex\n"
]
}
],
"source": [
"%load_ext rpy2.ipython"
]
},
{
"cell_type": "code",
"execution_count": 343,
"metadata": {},
"outputs": [],
"source": [
"%%R\n",
"\n",
"# The following function computes the new correlation coefficient, and returns the P-value for testing independence unless otherwise specified. Removes NAs and converts factor variables to integers automatically, unless otherwise specified. In general, it is safe to just supply x and y, and leave the other parameters to their default values.\n",
"\n",
"# The following function computes the new correlation coefficient, and returns the P-value for testing independence unless otherwise specified. Removes NAs and converts factor variables to integers automatically, unless otherwise specified. In general, it is safe to just supply x and y, and leave the other parameters to their default values.\n",
"\n",
"xi = function(x, y, pvalue = T, ties = T, method = \"asymptotic\", nperm = 1000, factor = T) {\n",
" \n",
" # x and y are the data vectors\n",
" # The P-value for the test of independence is returned if pvalue = T. Otherwise, only the coefficient is returned. \n",
" # If ties = T, the algorithm assumes that the data has ties and employs the more elaborated theory for calculating the P-value. Otherwise, it uses the simpler theory. There is no harm in putting ties = T even if there are no ties.\n",
" # method = \"asymptotic\" returns P-values computed by the asymptotic theory. If method = \"permutation\", a permutation test with nperm permutations is employed to estimate the P-value. Usually, there is no need for the permutation test. The asymptotic theory is good enough.\n",
" # nperm is the number of permutations for the permutation test, if needed.\n",
" # factor = T results in the algorithm checking whether x and y are factor variables and converting them to integers if they are. If it is known that the variables are numeric, a little bit of time can be saved by setting factor = F.\n",
" \n",
" \n",
" # NAs are removed here:\n",
" ok = complete.cases(x,y)\n",
" x = x[ok]\n",
" y = y[ok] \n",
" \n",
" # Factor variables are converted to integers here:\n",
" if (factor == T) {\n",
" if (!is.numeric(x)) x = as.numeric(factor(x))\n",
" if (!is.numeric(y)) y = as.numeric(factor(y))\n",
" }\n",
" \n",
" \n",
" n = length(x) # n is the sample size.\n",
" \n",
" PI = rank(x, ties.method = \"random\") # PI is the rank vector for x, with ties broken at random \n",
" \n",
" \n",
" f = rank(y, ties.method = \"max\")/n # f[i] is number of j s.t. y[j] <= y[i], divided by n.\n",
" \n",
" g = rank(-y, ties.method = \"max\")/n # g[i] is number of j s.t. y[j] >= y[i], divided by n.\n",
" \n",
" ord = order(PI) # order of the x's, ties broken at random.\n",
" \n",
" f = f[ord] # Rearrange f according to ord.\n",
" \n",
" # xi is calculated in the next three lines:\n",
" A1 = mean(abs(f[1:(n-1)] - f[2:n]))*(n-1)/(2*n)\n",
" C = mean(g*(1-g))\n",
" xi = 1 - A1/C\n",
" \n",
" \n",
" # If P-value needs to be computed:\n",
" if (pvalue == T) {\n",
" \n",
" # If there are no ties, return xi and theoretical P-value:\n",
" if (ties == F) return(list(xi = xi, pval = 1 - pnorm(sqrt(n)*xi/sqrt(2/5))))\n",
" \n",
" # If there are ties, and the theoretical method is to be used for calculation P-values:\n",
" if (method == \"asymptotic\") {\n",
" \n",
" # The following steps calculate the theoretical variance in the presence of ties:\n",
" q = sort(f)\n",
" ind = c(1:n)\n",
" ind2 = 2*n - 2*ind + 1\n",
" a = mean(ind2*q*q)/n\n",
" c = mean(ind2*q)/n\n",
" cq = cumsum(q)\n",
" m = (cq + (n - ind)*q)/n\n",
" b = mean(m^2)\n",
" v = (a - 2*b + c^2)/(C^2)\n",
" \n",
" # Return xi and P-value:\n",
" return(list(xi = xi, pval = 1 - pnorm(sqrt(n)*xi/sqrt(v))))\n",
" }\n",
" \n",
" # If permutation test is to be used for calculating P-value:\n",
" if (method == \"permutation\") {\n",
" r = rep(0, nperm)\n",
" for (i in 1:nperm) {\n",
" x1 = runif(n, 0, 1)\n",
" r[i] = xi(x1,y)$xi\n",
" }\n",
" \n",
" # Return xi and P-value based on permutation test:\n",
" return(list(xi = xi, pval = mean(r > xi)))\n",
" }\n",
" cat(\"Invalid method. Use either asymptotic or permutation.\")\n",
" }\n",
" \n",
" # If only xi is desired, return xi:\n",
" else return(xi)\n",
"}\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 345,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import scipy.stats as ss\n",
"from scipy.stats import norm\n",
"\n",
"def xi(x, y):\n",
" x= pd.Series(x)\n",
" y= pd.Series(y)\n",
" \n",
" # remove NAs\n",
" both_not_null = x.isnull() & y.isnull()\n",
" x = x[~both_not_null]\n",
" y = y[~both_not_null]\n",
" \n",
" # sample size\n",
" n = len(x) \n",
" # PI is the rank vector for x, with ties broken at ordinal (needs to be random)\n",
" PI = x.rank(method='dense')\n",
" # f[i] is number of j s.t. y[j] <= y[i], divided by n.\n",
" f = y.rank(method=\"max\")/n \n",
" # g[i] is number of j s.t. y[j] >= y[i], divided by n.\n",
" g = pd.Series([1-i for i in y]).rank(method=\"max\")/n \n",
" # order of the x's, ties broken at random.\n",
" ord = np.argsort(PI) \n",
" # Rearrange f according to ord.\n",
" f = [f[i] for i in ord] \n",
"\n",
" A1 = np.mean(np.abs([x-y for x,y in zip(f[0:(n-1)], f[1:n])]))*(n-1)/(2*n)\n",
" C = np.mean(g*(1-g))\n",
" xi_val = 1 - A1/C\n",
" return xi_val, n, f, C\n",
"\n",
"\n",
"def pvalue_asymptotic(xi_val, n, f, C, ties = False, nperm = 1000, factor = True):\n",
" \n",
" # If there are no ties, return xi and theoretical P-value:\n",
" if ties:\n",
" return 1-ss.norm.cdf(np.sqrt(n)*xi_val/np.sqrt(2/5))\n",
"\n",
" # If there are ties, and the theoretical method is to be used for calculation P-values:\n",
" # The following steps calculate the theoretical variance in the presence of ties:\n",
" q = sorted(f)\n",
" ind = [i+1 for i in range(n)]\n",
" ind2 = [2*n - 2*ind[i-1]+1 for i in ind]\n",
" a = np.mean([i*j*j for i,j in zip(ind2,q)])/n\n",
" c = np.mean([i*j for i,j in zip(ind2,q)])/n\n",
" cq = np.cumsum(q)\n",
" m =[(i + (n - j)*k)/n for i,j,k in zip(cq,ind,q)]\n",
" b = np.mean([np.square(i) for i in m])\n",
" v = (a - 2*b + np.square(c))/np.square(C)\n",
" return 1-ss.norm.cdf(np.sqrt(n)*xi_val/np.sqrt(v))\n",
" \n",
"\n",
"def pvalue_permutation_test(nperm, n, y):\n",
"# # If permutation test is to be used for calculating P-value:\n",
" r = np.zeros((nperm,), dtype=int)\n",
" for idx,val in enumerate(nperm):\n",
" # x1 = runif(n, 0, 1)\n",
" x1 = np.random.uniform(n, 0, 1)\n",
" r[idx] = xi(x1,y)\n",
"\n",
" # Return xi and P-value based on permutation test:\n",
" return (np.mean([ri for ri in r if ri > xi]))\n",
"\n",
" \n",
"def wrapper(x, y, pvalue=True, ties=False, method=\"asymptotic\", nperm=1000, factor=True):\n",
" xi_val, n, f, C = xi(x,y)\n",
" if pvalue:\n",
" if method == \"asymptotic\":\n",
" pvalue = pvalue_asymptotic(xi_val, n, f, C, ties=ties, nperm=nperm, factor=factor)\n",
" elif method == \"permutation\":\n",
" pvalue = pvalue_permutation_test(nperm, n, y)\n",
" else:\n",
" print(f\"method: {method} currently not supported, please select asymptotic or permutation\")\n",
" import sys\n",
" sys.exit()\n",
" \n",
" return (xi_val, pvalue)\n",
" return xi\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 346,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.2749999999999999 0.07841556446646347\n"
]
}
],
"source": [
"# def xi(x, y, pvalue=True, ties=True, method=\"asymptotic\", nperm=1000, factor=True):\n",
"x_i = [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]\n",
"y_i = [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]\n",
"x_ii = [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]\n",
"y_ii = [9.14, 8.14, 8.74, 8.77, 9.26, 8.1, 6.13, 3.1, 9.13, 7.26, 4.74]\n",
"pvalue = True\n",
"ties = False\n",
"method = \"asymptotic\"\n",
"nperm = 1000\n",
"factor = True\n",
"\n",
"xi_val, pvalue = wrapper(x_i, y_i, pvalue=True, ties=False, method=\"asymptotic\", nperm=1000, factor=True)\n",
"print(xi_val, pvalue)"
]
},
{
"cell_type": "code",
"execution_count": 347,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"$xi\n",
"[1] 0.275\n",
"\n",
"$pval\n",
"[1] 0.07841556\n",
"\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%R\n",
"\n",
"x_i = c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5)\n",
"y_i = c( 8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68) \n",
"x_ii = c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5)\n",
"y_ii = c(9.14, 8.14, 8.74, 8.77, 9.26, 8.1, 6.13, 3.1, 9.13, 7.26, 4.74)\n",
"x_iii = c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5)\n",
"y_iii = c( 7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73)\n",
"x_iv = c( 8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8)\n",
"y_iv = c( 6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.5 , 5.56, 7.91, 6.89)\n",
"\n",
"xi(x_i, y_i, pvalue = T, ties = T, method = \"asymptotic\", nperm = 1000, factor = T)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:correlation]",
"language": "python",
"name": "conda-env-correlation-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

0 comments on commit 02075b8

Please sign in to comment.