
Commit

Merge branch 'master' into dev
johannesostner committed Aug 11, 2020
2 parents 3b869df + a13e808 commit c7a1999
Showing 10 changed files with 458 additions and 240 deletions.
1 change: 1 addition & 0 deletions docs/requires.txt
@@ -14,3 +14,4 @@ patsy
sklearn
scanpy
statsmodels
+ pickle
2 changes: 1 addition & 1 deletion requirements.txt
@@ -4,7 +4,7 @@ seaborn
matplotlib
tensorflow>=2.1.0
tensorflow-probability>=0.9.0
- arviz>=0.7
+ arviz>=0.8
scipy
anndata
patsy
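Context for this pin: arviz 0.8 renamed the credible_interval argument to hdi_prob and switched the interval columns from an hpd prefix to hdi, which is what every rename in the files below tracks. A minimal before/after sketch (ca_result is the fitted result object name used in the test scripts below):

# Old API (arviz < 0.8), removed upstream:
# ca_result.summary(credible_interval=0.95)   # produced "HPD X%" columns
# New API (arviz >= 0.8), used throughout this commit:
ca_result.summary(hdi_prob=0.95)              # produces "HDI X%" columns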
17 changes: 9 additions & 8 deletions scdcdm/util/result_classes.py
@@ -98,14 +98,14 @@ def summary_prepare(self, *args, **kwargs):
intercept_df -- pandas df
Summary of intercept parameters. Contains one row per cell type. Columns:
Final Parameter: Final intercept model parameter
- HPD X%: Upper and lower boundaries of confidence interval (width specified via credible_interval=)
+ HDI X%: Upper and lower boundaries of confidence interval (width specified via hdi_prob=)
SD: Standard deviation of MCMC samples
Expected sample: Expected cell counts for a sample with no present covariates. See the tutorial for more explanation
effect_df -- pandas df
Summary of effect (slope) parameters. Contains one row per covariate/cell type combination. Columns:
Final Parameter: Final effect model parameter. If this parameter is 0, the effect is not significant, else it is.
- HPD X%: Upper and lower boundaries of confidence interval (width specified via credible_interval=)
+ HDI X%: Upper and lower boundaries of confidence interval (width specified via hdi_prob=)
SD: Standard deviation of MCMC samples
Expected sample: Expected cell counts for a sample with only the current covariate set to 1. See the tutorial for more explanation
log2-fold change: Log2-fold change between expected cell counts with no covariates and with only the current covariate
@@ -129,20 +129,20 @@ def summary_prepare(self, *args, **kwargs):
effect_df = self.complete_beta_df(intercept_df, effect_df)

# Give nice column names, remove unnecessary columns
- hpds = intercept_df.columns[intercept_df.columns.str.contains("hpd")]
- hpds_new = hpds.str.replace("hpd_", "HPD ")
+ hdis = intercept_df.columns[intercept_df.columns.str.contains("hdi")]
+ hdis_new = hdis.str.replace("hdi_", "HDI ")

- intercept_df = intercept_df.loc[:, ["final_parameter", hpds[0], hpds[1], "sd", "expected_sample"]]
+ intercept_df = intercept_df.loc[:, ["final_parameter", hdis[0], hdis[1], "sd", "expected_sample"]]
intercept_df = intercept_df.rename(columns=dict(zip(
intercept_df.columns,
["Final Parameter", hpds_new[0], hpds_new[1], "SD", "Expected Sample"]
["Final Parameter", hdis_new[0], hdis_new[1], "SD", "Expected Sample"]
)))

- effect_df = effect_df.loc[:, ["final_parameter", hpds[0], hpds[1], "sd", "inclusion_prob",
+ effect_df = effect_df.loc[:, ["final_parameter", hdis[0], hdis[1], "sd", "inclusion_prob",
"expected_sample", "log_fold"]]
effect_df = effect_df.rename(columns=dict(zip(
effect_df.columns,
["Final Parameter", hpds_new[0], hpds_new[1], "SD", "Inclusion probability",
["Final Parameter", hdis_new[0], hdis_new[1], "SD", "Inclusion probability",
"Expected Sample", "log2-fold change"]
)))

@@ -399,3 +399,4 @@ def distances(self):
def save(self, path_to_file):
with open(path_to_file, "wb") as f:
pkl.dump(self, file=f, protocol=4)

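Taken together, the result_classes.py changes mean a fitted result is summarized with the new arviz keyword and can be round-tripped through pickle via the save() method above. A minimal sketch, assuming a fitted result object named ca_result, that pkl is the standard pickle module, and an illustrative file path:

import pickle

ca_result.summary(hdi_prob=0.95)     # interval width now set via hdi_prob
ca_result.save("data/test")          # wraps pkl.dump(self, file=f, protocol=4)

with open("data/test", "rb") as f:   # loading back is plain pickle
    loaded_result = pickle.load(f)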
2 changes: 1 addition & 1 deletion tests/tests_during_development/arviz_testing.py
@@ -41,7 +41,7 @@
#%%
ca_result = ana.sample_hmc(num_results=int(1000), n_burnin=int(500))

- ca_result.summary(credible_interval=0.95)
+ ca_result.summary(hdi_prob=0.95)

#%%
az.plot_trace(ca_result, var_names="beta", coords={"cov": [0], "cell_type": ["0", "1", "2", "3", "4"]})
50 changes: 50 additions & 0 deletions tests/tests_during_development/debug.py
@@ -0,0 +1,50 @@
+ import numpy as np
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import importlib
+ import arviz as az
+
+ from scdcdm.util import data_generation as gen
+ from scdcdm.util import comp_ana as mod
+ from scdcdm.util import result_classes as res
+ from scdcdm.util import multi_parameter_sampling as mult
+ from scdcdm.util import cell_composition_data as dat
+
+ pd.options.display.float_format = '{:10,.3f}'.format
+ pd.set_option('display.max_columns', None)
+
+ # Artificial data
+ np.random.seed(1234)
+
+ n = 3
+
+ cases = 1
+ K = 5
+ n_samples = [n, n]
+ n_total = np.full(shape=[2*n], fill_value=1000)
+
+ data = gen.generate_case_control(cases, K, n_total[0], n_samples,
+                                  w_true=np.array([[1, 0, 0, 0, 0]]),
+                                  b_true=np.log(np.repeat(0.2, K)).tolist())
+
+ print(data.uns["w_true"])
+ print(data.uns["b_true"])
+
+ print(data.X)
+ print(data.obs)
+
+ #%%
+ importlib.reload(mod)
+
+ ana = mod.CompositionalAnalysis(data, "x_0", baseline_index=None)
+ print(ana.x)
+ print(ana.y)
+ print(ana.covariate_names)
+
+ params_mcmc = ana.sample_hmc(num_results=int(1000), n_burnin=500)
+ print(params_mcmc)
+
+ #%%
+
+ params_mcmc.summary(hdi_prob=0.9)
7 changes: 4 additions & 3 deletions tests/tests_during_development/final_model_testing.py
@@ -72,18 +72,19 @@

ana = mod.CompositionalAnalysis(data, "x_0", baseline_index=None)
print(ana.x)
print(ana.y)
print(ana.covariate_names)

#%%
importlib.reload(mod)
params_mcmc = ana.sample_hmc(num_results=int(1000), n_burnin=500)
print(params_mcmc)

#%%
params_mcmc.summary()

#%%
- params_mcmc.summary_extended(credible_interval=0.9)

+ params_mcmc.summary_extended(hdi_prob=0.9)

#%%
path = "data/test"
@@ -118,7 +119,7 @@
params_mcmc = ana_simple.sample_hmc(num_results=int(2000), n_burnin=500)

#%%
- params_mcmc.summary(credible_interval=0.95)
+ params_mcmc.summary(hdi_prob=0.95)

#%%
az.plot_trace(params_mcmc)
150 changes: 89 additions & 61 deletions tests/tests_during_development/sle_freq_analysis.ipynb
@@ -124,24 +124,28 @@
"print(cluster_names)\n",
"\n",
"cell_types = pd.DataFrame(index=cluster_names)\n",
"print(cell_types)\n",
"\n",
"#%%# Put all together\n",
"\n",
"sle_freq_data = ad.AnnData(X=cell_counts, var=cell_types, obs=group_df)"
"print(cell_types)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"### Modeling without baseline\n"
"sle_freq_data = ad.AnnData(X=cell_counts, var=cell_types, obs=group_df)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
@@ -467,28 +471,32 @@
}
],
"source": [
"\n",
"ana = mod.CompositionalAnalysis(sle_freq_data, \"Group\", baseline_index=None)\n",
"ca_result = ana.sample_hmc(num_results=int(20000), n_burnin=5000)\n",
"\n",
"ca_result.summary(credible_interval=0.95)"
"### Modeling without baseline\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"This does not work well (too many cell types)! -> all inclusion_probs are 0 or 1\n",
"\n",
"Lowering number of cell types by excluding underrepresented ones...\n",
"ana = mod.CompositionalAnalysis(sle_freq_data, \"Group\", baseline_index=None)\n",
"ca_result = ana.sample_hmc(num_results=int(20000), n_burnin=5000)\n",
"\n",
"### Use only celltypes with more than 3% of total cells in at least one sample (17 types)\n"
"ca_result.summary(hdi_prob=0.95)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
@@ -509,29 +517,27 @@
}
],
"source": [
"cells_rep = cell_counts_df.loc[:, ~(cell_freq < 0.03).all()]\n",
"cells_rep_rel = cell_freq.loc[:, ~(cell_freq < 0.03).all()]\n",
"cell_types_rep = pd.DataFrame(index=cells_rep.columns)\n",
"\n",
"sle_freq_data = ad.AnnData(X=cells_rep.values, var=cell_types_rep, obs=group_df)\n",
"print(sle_freq_data.X.shape)\n",
"\n",
"#%%\n",
"cells_plot = pd.melt(cells_rep_rel.reset_index(), id_vars=\"file\", var_name=\"Cell_Type\", value_name=\"Count\")\n",
"This does not work well (too many cell types)! -> all inclusion_probs are 0 or 1\n",
"\n",
"cells_plot_2 = pd.melt(cells_rep_rel.drop(columns=\"Clust_36|percent_total\").reset_index(), id_vars=\"file\", var_name=\"Cell_Type\", value_name=\"Count\")\n",
"Lowering number of cell types by excluding underrepresented ones...\n",
"\n",
"fig, ax = plt.subplots(figsize=(12, 5))\n",
"sns.barplot(data=cells_plot, x=\"Cell_Type\", y=\"Count\", hue=\"file\", ax=ax)\n",
"plt.xticks(rotation=90)\n",
"plt.show()"
"### Use only celltypes with more than 3% of total cells in at least one sample (17 types)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"### Modeling without baseline\n"
"cells_rep = cell_counts_df.loc[:, ~(cell_freq < 0.03).all()]\n",
"cells_rep_rel = cell_freq.loc[:, ~(cell_freq < 0.03).all()]\n",
"cell_types_rep = pd.DataFrame(index=cells_rep.columns)\n",
"\n",
"sle_freq_data = ad.AnnData(X=cells_rep.values, var=cell_types_rep, obs=group_df)\n",
"print(sle_freq_data.X.shape)"
]
},
{
@@ -677,21 +683,21 @@
}
],
"source": [
"ana = mod.CompositionalAnalysis(sle_freq_data, \"int_Group\", baseline_index=None)\n",
"cells_plot = pd.melt(cells_rep_rel.reset_index(), id_vars=\"file\", var_name=\"Cell_Type\", value_name=\"Count\")\n",
"\n",
"# Still bad convergence -> increase chain length\n",
"ca_result = ana.sample_hmc(num_results=int(100000), n_burnin=5000)\n",
"cells_plot_2 = pd.melt(cells_rep_rel.drop(columns=\"Clust_36|percent_total\").reset_index(), id_vars=\"file\", var_name=\"Cell_Type\", value_name=\"Count\")\n",
"\n",
"ca_result.summary(credible_interval=0.95)"
"fig, ax = plt.subplots(figsize=(12, 5))\n",
"sns.barplot(data=cells_plot, x=\"Cell_Type\", y=\"Count\", hue=\"file\", ax=ax)\n",
"plt.xticks(rotation=90)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### With baseline\n",
"\n",
"Baseline on celltype clust_26 (index 9)"
"### Modeling without baseline\n"
]
},
{
@@ -700,32 +706,47 @@
"metadata": {},
"outputs": [],
"source": [
"ana_bl_reduced = mod.CompositionalAnalysis(sle_freq_data, \"int_Group\", baseline_index=9)\n",
"ana = mod.CompositionalAnalysis(sle_freq_data, \"int_Group\", baseline_index=None)\n",
"\n",
"# Still bad convergence -> increase chain length\n",
"result_bl_reduced = ana_bl_reduced.sample_hmc(num_results=int(100000), n_burnin=5000)\n",
"ca_result = ana.sample_hmc(num_results=int(100000), n_burnin=5000)\n",
"\n",
"result_bl_reduced.summary(credible_interval=0.95)"
"ca_result.summary(hdi_prob=0.95)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Not a good idea from relative point of view (see plot)"
"### With baseline\n",
"\n",
"Baseline on celltype clust_26 (index 9)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"Baseline on celltype clust_04 (index 1)"
"ana_bl_reduced = mod.CompositionalAnalysis(sle_freq_data, \"int_Group\", baseline_index=9)\n",
"\n",
"# Still bad convergence -> increase chain length\n",
"result_bl_reduced = ana_bl_reduced.sample_hmc(num_results=int(100000), n_burnin=5000)\n",
"\n",
"result_bl_reduced.summary(hdi_prob=0.95)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
@@ -841,19 +862,14 @@
}
],
"source": [
"ana_bl_reduced = mod.CompositionalAnalysis(sle_freq_data, \"int_Group\", baseline_index=1)\n",
"\n",
"# Still bad convergence -> increase chain length\n",
"result_bl_reduced = ana_bl_reduced.sample_hmc(num_results=int(100000), n_burnin=5000)\n",
"\n",
"result_bl_reduced.summary(credible_interval=0.95)"
"Not a good idea from relative point of view (see plot)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Baseline on celltype clust_09 (index 3)"
"Baseline on celltype clust_04 (index 1)"
]
},
{
@@ -975,18 +991,22 @@
}
],
"source": [
"ana_bl_reduced = mod.CompositionalAnalysis(sle_freq_data, \"int_Group\", baseline_index=3)\n",
"ana_bl_reduced = mod.CompositionalAnalysis(sle_freq_data, \"int_Group\", baseline_index=1)\n",
"\n",
"# Still bad convergence -> increase chain length\n",
"result_bl_reduced = ana_bl_reduced.sample_hmc(num_results=int(100000), n_burnin=5000)\n",
"\n",
"result_bl_reduced.summary(credible_interval=0.95)"
"result_bl_reduced.summary(hdi_prob=0.95)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
@@ -1000,8 +1020,7 @@
}
],
"source": [
"az.plot_trace(result_bl_reduced)\n",
"plt.show()"
"Baseline on celltype clust_09 (index 3)"
]
}
],
@@ -1022,8 +1041,17 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
6 changes: 3 additions & 3 deletions tests/tests_during_development/sle_freq_analysis.py
@@ -59,7 +59,7 @@
#%%
ca_result = ana.sample_hmc(num_results=int(20000), n_burnin=5000)

- ca_result.summary(credible_interval=0.95)
+ ca_result.summary(hdi_prob=0.95)

#%%
az.plot_trace(ca_result)
@@ -73,7 +73,7 @@
#%%
ca_result_2 = ana_2.sample_hmc(num_results=int(2e4), n_burnin=5e3)

- ca_result_2.summary(credible_interval=0.95)
+ ca_result_2.summary(hdi_prob=0.95)


#%%
@@ -106,7 +106,7 @@
#%%
ca_result = ana.sample_hmc(num_results=int(20000), n_burnin=5000)

- ca_result.summary(credible_interval=0.95)
+ ca_result.summary(hdi_prob=0.95)

#%%
az.plot_trace(ca_result)
463 changes: 300 additions & 163 deletions tutorials/Tutorial.ipynb

Large diffs are not rendered by default.

Binary file added tutorials/test
Binary file not shown.
