forked from czbiohub-sf/xicor
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request czbiohub-sf#1 from czbiohub/phoenix/initial-xi
initial R and python code for xi correlation
- Loading branch information
Showing
1 changed file
with
292 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,292 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"/home/phoenix/miniconda3/envs/correlation/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:17: FutureWarning: pandas.core.index is deprecated and will be removed in a future version. The public classes are available in the top-level namespace.\n", | ||
" from pandas.core.index import Index as PandasIndex\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"%load_ext rpy2.ipython" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 343, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"%%R\n", | ||
"\n", | ||
"# The following function computes the new correlation coefficient, and returns the P-value for testing independence unless otherwise specified. Removes NAs and converts factor variables to integers automatically, unless otherwise specified. In general, it is safe to just supply x and y, and leave the other parameters to their default values.\n", | ||
"\n", | ||
"# The following function computes the new correlation coefficient, and returns the P-value for testing independence unless otherwise specified. Removes NAs and converts factor variables to integers automatically, unless otherwise specified. In general, it is safe to just supply x and y, and leave the other parameters to their default values.\n", | ||
"\n", | ||
"xi = function(x, y, pvalue = T, ties = T, method = \"asymptotic\", nperm = 1000, factor = T) {\n", | ||
" \n", | ||
" # x and y are the data vectors\n", | ||
" # The P-value for the test of independence is returned if pvalue = T. Otherwise, only the coefficient is returned. \n", | ||
" # If ties = T, the algorithm assumes that the data has ties and employs the more elaborated theory for calculating the P-value. Otherwise, it uses the simpler theory. There is no harm in putting ties = T even if there are no ties.\n", | ||
" # method = \"asymptotic\" returns P-values computed by the asymptotic theory. If method = \"permutation\", a permutation test with nperm permutations is employed to estimate the P-value. Usually, there is no need for the permutation test. The asymptotic theory is good enough.\n", | ||
" # nperm is the number of permutations for the permutation test, if needed.\n", | ||
" # factor = T results in the algorithm checking whether x and y are factor variables and converting them to integers if they are. If it is known that the variables are numeric, a little bit of time can be saved by setting factor = F.\n", | ||
" \n", | ||
" \n", | ||
" # NAs are removed here:\n", | ||
" ok = complete.cases(x,y)\n", | ||
" x = x[ok]\n", | ||
" y = y[ok] \n", | ||
" \n", | ||
" # Factor variables are converted to integers here:\n", | ||
" if (factor == T) {\n", | ||
" if (!is.numeric(x)) x = as.numeric(factor(x))\n", | ||
" if (!is.numeric(y)) y = as.numeric(factor(y))\n", | ||
" }\n", | ||
" \n", | ||
" \n", | ||
" n = length(x) # n is the sample size.\n", | ||
" \n", | ||
" PI = rank(x, ties.method = \"random\") # PI is the rank vector for x, with ties broken at random \n", | ||
" \n", | ||
" \n", | ||
" f = rank(y, ties.method = \"max\")/n # f[i] is number of j s.t. y[j] <= y[i], divided by n.\n", | ||
" \n", | ||
" g = rank(-y, ties.method = \"max\")/n # g[i] is number of j s.t. y[j] >= y[i], divided by n.\n", | ||
" \n", | ||
" ord = order(PI) # order of the x's, ties broken at random.\n", | ||
" \n", | ||
" f = f[ord] # Rearrange f according to ord.\n", | ||
" \n", | ||
" # xi is calculated in the next three lines:\n", | ||
" A1 = mean(abs(f[1:(n-1)] - f[2:n]))*(n-1)/(2*n)\n", | ||
" C = mean(g*(1-g))\n", | ||
" xi = 1 - A1/C\n", | ||
" \n", | ||
" \n", | ||
" # If P-value needs to be computed:\n", | ||
" if (pvalue == T) {\n", | ||
" \n", | ||
" # If there are no ties, return xi and theoretical P-value:\n", | ||
" if (ties == F) return(list(xi = xi, pval = 1 - pnorm(sqrt(n)*xi/sqrt(2/5))))\n", | ||
" \n", | ||
" # If there are ties, and the theoretical method is to be used for calculation P-values:\n", | ||
" if (method == \"asymptotic\") {\n", | ||
" \n", | ||
" # The following steps calculate the theoretical variance in the presence of ties:\n", | ||
" q = sort(f)\n", | ||
" ind = c(1:n)\n", | ||
" ind2 = 2*n - 2*ind + 1\n", | ||
" a = mean(ind2*q*q)/n\n", | ||
" c = mean(ind2*q)/n\n", | ||
" cq = cumsum(q)\n", | ||
" m = (cq + (n - ind)*q)/n\n", | ||
" b = mean(m^2)\n", | ||
" v = (a - 2*b + c^2)/(C^2)\n", | ||
" \n", | ||
" # Return xi and P-value:\n", | ||
" return(list(xi = xi, pval = 1 - pnorm(sqrt(n)*xi/sqrt(v))))\n", | ||
" }\n", | ||
" \n", | ||
" # If permutation test is to be used for calculating P-value:\n", | ||
" if (method == \"permutation\") {\n", | ||
" r = rep(0, nperm)\n", | ||
" for (i in 1:nperm) {\n", | ||
" x1 = runif(n, 0, 1)\n", | ||
" r[i] = xi(x1,y)$xi\n", | ||
" }\n", | ||
" \n", | ||
" # Return xi and P-value based on permutation test:\n", | ||
" return(list(xi = xi, pval = mean(r > xi)))\n", | ||
" }\n", | ||
" cat(\"Invalid method. Use either asymptotic or permutation.\")\n", | ||
" }\n", | ||
" \n", | ||
" # If only xi is desired, return xi:\n", | ||
" else return(xi)\n", | ||
"}\n", | ||
"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 345, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import numpy as np\n", | ||
"import scipy.stats as ss\n", | ||
"from scipy.stats import norm\n", | ||
"\n", | ||
"def xi(x, y):\n", | ||
" x= pd.Series(x)\n", | ||
" y= pd.Series(y)\n", | ||
" \n", | ||
" # remove NAs\n", | ||
" both_not_null = x.isnull() & y.isnull()\n", | ||
" x = x[~both_not_null]\n", | ||
" y = y[~both_not_null]\n", | ||
" \n", | ||
" # sample size\n", | ||
" n = len(x) \n", | ||
" # PI is the rank vector for x, with ties broken at ordinal (needs to be random)\n", | ||
" PI = x.rank(method='dense')\n", | ||
" # f[i] is number of j s.t. y[j] <= y[i], divided by n.\n", | ||
" f = y.rank(method=\"max\")/n \n", | ||
" # g[i] is number of j s.t. y[j] >= y[i], divided by n.\n", | ||
" g = pd.Series([1-i for i in y]).rank(method=\"max\")/n \n", | ||
" # order of the x's, ties broken at random.\n", | ||
" ord = np.argsort(PI) \n", | ||
" # Rearrange f according to ord.\n", | ||
" f = [f[i] for i in ord] \n", | ||
"\n", | ||
" A1 = np.mean(np.abs([x-y for x,y in zip(f[0:(n-1)], f[1:n])]))*(n-1)/(2*n)\n", | ||
" C = np.mean(g*(1-g))\n", | ||
" xi_val = 1 - A1/C\n", | ||
" return xi_val, n, f, C\n", | ||
"\n", | ||
"\n", | ||
"def pvalue_asymptotic(xi_val, n, f, C, ties = False, nperm = 1000, factor = True):\n", | ||
" \n", | ||
" # If there are no ties, return xi and theoretical P-value:\n", | ||
" if ties:\n", | ||
" return 1-ss.norm.cdf(np.sqrt(n)*xi_val/np.sqrt(2/5))\n", | ||
"\n", | ||
" # If there are ties, and the theoretical method is to be used for calculation P-values:\n", | ||
" # The following steps calculate the theoretical variance in the presence of ties:\n", | ||
" q = sorted(f)\n", | ||
" ind = [i+1 for i in range(n)]\n", | ||
" ind2 = [2*n - 2*ind[i-1]+1 for i in ind]\n", | ||
" a = np.mean([i*j*j for i,j in zip(ind2,q)])/n\n", | ||
" c = np.mean([i*j for i,j in zip(ind2,q)])/n\n", | ||
" cq = np.cumsum(q)\n", | ||
" m =[(i + (n - j)*k)/n for i,j,k in zip(cq,ind,q)]\n", | ||
" b = np.mean([np.square(i) for i in m])\n", | ||
" v = (a - 2*b + np.square(c))/np.square(C)\n", | ||
" return 1-ss.norm.cdf(np.sqrt(n)*xi_val/np.sqrt(v))\n", | ||
" \n", | ||
"\n", | ||
"def pvalue_permutation_test(nperm, n, y):\n", | ||
"# # If permutation test is to be used for calculating P-value:\n", | ||
" r = np.zeros((nperm,), dtype=int)\n", | ||
" for idx,val in enumerate(nperm):\n", | ||
" # x1 = runif(n, 0, 1)\n", | ||
" x1 = np.random.uniform(n, 0, 1)\n", | ||
" r[idx] = xi(x1,y)\n", | ||
"\n", | ||
" # Return xi and P-value based on permutation test:\n", | ||
" return (np.mean([ri for ri in r if ri > xi]))\n", | ||
"\n", | ||
" \n", | ||
"def wrapper(x, y, pvalue=True, ties=False, method=\"asymptotic\", nperm=1000, factor=True):\n", | ||
" xi_val, n, f, C = xi(x,y)\n", | ||
" if pvalue:\n", | ||
" if method == \"asymptotic\":\n", | ||
" pvalue = pvalue_asymptotic(xi_val, n, f, C, ties=ties, nperm=nperm, factor=factor)\n", | ||
" elif method == \"permutation\":\n", | ||
" pvalue = pvalue_permutation_test(nperm, n, y)\n", | ||
" else:\n", | ||
" print(f\"method: {method} currently not supported, please select asymptotic or permutation\")\n", | ||
" import sys\n", | ||
" sys.exit()\n", | ||
" \n", | ||
" return (xi_val, pvalue)\n", | ||
" return xi\n", | ||
" " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 346, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"0.2749999999999999 0.07841556446646347\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# def xi(x, y, pvalue=True, ties=True, method=\"asymptotic\", nperm=1000, factor=True):\n", | ||
"x_i = [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]\n", | ||
"y_i = [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]\n", | ||
"x_ii = [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]\n", | ||
"y_ii = [9.14, 8.14, 8.74, 8.77, 9.26, 8.1, 6.13, 3.1, 9.13, 7.26, 4.74]\n", | ||
"pvalue = True\n", | ||
"ties = False\n", | ||
"method = \"asymptotic\"\n", | ||
"nperm = 1000\n", | ||
"factor = True\n", | ||
"\n", | ||
"xi_val, pvalue = wrapper(x_i, y_i, pvalue=True, ties=False, method=\"asymptotic\", nperm=1000, factor=True)\n", | ||
"print(xi_val, pvalue)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 347, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"$xi\n", | ||
"[1] 0.275\n", | ||
"\n", | ||
"$pval\n", | ||
"[1] 0.07841556\n", | ||
"\n" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
} | ||
], | ||
"source": [ | ||
"%%R\n", | ||
"\n", | ||
"x_i = c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5)\n", | ||
"y_i = c( 8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68) \n", | ||
"x_ii = c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5)\n", | ||
"y_ii = c(9.14, 8.14, 8.74, 8.77, 9.26, 8.1, 6.13, 3.1, 9.13, 7.26, 4.74)\n", | ||
"x_iii = c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5)\n", | ||
"y_iii = c( 7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73)\n", | ||
"x_iv = c( 8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8)\n", | ||
"y_iv = c( 6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.5 , 5.56, 7.91, 6.89)\n", | ||
"\n", | ||
"xi(x_i, y_i, pvalue = T, ties = T, method = \"asymptotic\", nperm = 1000, factor = T)\n" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python [conda env:correlation]", | ||
"language": "python", | ||
"name": "conda-env-correlation-py" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.6" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |