
Commit

Merge branch 'master' into dev
johannesostner committed Aug 11, 2020
2 parents 3b869df + a13e808 commit c7a1999
Showing 10 changed files with 458 additions and 240 deletions.
1 change: 1 addition & 0 deletions docs/requires.txt
@@ -14,3 +14,4 @@ patsy
sklearn
scanpy
statsmodels
+ pickle
2 changes: 1 addition & 1 deletion requirements.txt
@@ -4,7 +4,7 @@ seaborn
matplotlib
tensorflow>=2.1.0
tensorflow-probability>=0.9.0
- arviz>=0.7
+ arviz>=0.8
scipy
anndata
patsy
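Context for this pin: arviz 0.8 renamed the credible_interval argument to hdi_prob and switched the interval columns from an hpd prefix to hdi, which is what every rename in the files below tracks. A minimal before/after sketch (ca_result is the fitted result object name used in the test scripts below):

# Old API (arviz < 0.8), removed upstream:
# ca_result.summary(credible_interval=0.95)   # produced "HPD X%" columns
# New API (arviz >= 0.8), used throughout this commit:
ca_result.summary(hdi_prob=0.95)              # produces "HDI X%" columns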
17 changes: 9 additions & 8 deletions scdcdm/util/result_classes.py
@@ -98,14 +98,14 @@ def summary_prepare(self, *args, **kwargs):
intercept_df -- pandas df
Summary of intercept parameters. Contains one row per cell type. Columns:
Final Parameter: Final intercept model parameter
- HPD X%: Upper and lower boundaries of confidence interval (width specified via credible_interval=)
+ HDI X%: Upper and lower boundaries of confidence interval (width specified via hdi_prob=)
SD: Standard deviation of MCMC samples
Expected sample: Expected cell counts for a sample with no present covariates. See the tutorial for more explanation
effect_df -- pandas df
Summary of effect (slope) parameters. Contains one row per covariate/cell type combination. Columns:
Final Parameter: Final effect model parameter. If this parameter is 0, the effect is not significant, else it is.
- HPD X%: Upper and lower boundaries of confidence interval (width specified via credible_interval=)
+ HDI X%: Upper and lower boundaries of confidence interval (width specified via hdi_prob=)
SD: Standard deviation of MCMC samples
Expected sample: Expected cell counts for a sample with only the current covariate set to 1. See the tutorial for more explanation
log2-fold change: Log2-fold change between expected cell counts with no covariates and with only the current covariate
@@ -129,20 +129,20 @@ def summary_prepare(self, *args, **kwargs):
effect_df = self.complete_beta_df(intercept_df, effect_df)

# Give nice column names, remove unnecessary columns
- hpds = intercept_df.columns[intercept_df.columns.str.contains("hpd")]
- hpds_new = hpds.str.replace("hpd_", "HPD ")
+ hdis = intercept_df.columns[intercept_df.columns.str.contains("hdi")]
+ hdis_new = hdis.str.replace("hdi_", "HDI ")

- intercept_df = intercept_df.loc[:, ["final_parameter", hpds[0], hpds[1], "sd", "expected_sample"]]
+ intercept_df = intercept_df.loc[:, ["final_parameter", hdis[0], hdis[1], "sd", "expected_sample"]]
intercept_df = intercept_df.rename(columns=dict(zip(
intercept_df.columns,
["Final Parameter", hpds_new[0], hpds_new[1], "SD", "Expected Sample"]
["Final Parameter", hdis_new[0], hdis_new[1], "SD", "Expected Sample"]
)))

- effect_df = effect_df.loc[:, ["final_parameter", hpds[0], hpds[1], "sd", "inclusion_prob",
+ effect_df = effect_df.loc[:, ["final_parameter", hdis[0], hdis[1], "sd", "inclusion_prob",
"expected_sample", "log_fold"]]
effect_df = effect_df.rename(columns=dict(zip(
effect_df.columns,
["Final Parameter", hpds_new[0], hpds_new[1], "SD", "Inclusion probability",
["Final Parameter", hdis_new[0], hdis_new[1], "SD", "Inclusion probability",
"Expected Sample", "log2-fold change"]
)))

@@ -399,3 +399,4 @@ def distances(self):
def save(self, path_to_file):
with open(path_to_file, "wb") as f:
pkl.dump(self, file=f, protocol=4)

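Taken together, the result_classes.py changes mean a fitted result is summarized with the new arviz keyword and can be round-tripped through pickle via the save() method above. A minimal sketch, assuming a fitted result object named ca_result, that pkl is the standard pickle module, and an illustrative file path:

import pickle

ca_result.summary(hdi_prob=0.95)     # interval width now set via hdi_prob
ca_result.save("data/test")          # wraps pkl.dump(self, file=f, protocol=4)

with open("data/test", "rb") as f:   # loading back is plain pickle
    loaded_result = pickle.load(f)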
2 changes: 1 addition & 1 deletion tests/tests_during_development/arviz_testing.py
@@ -41,7 +41,7 @@
#%%
ca_result = ana.sample_hmc(num_results=int(1000), n_burnin=int(500))

- ca_result.summary(credible_interval=0.95)
+ ca_result.summary(hdi_prob=0.95)

#%%
az.plot_trace(ca_result, var_names="beta", coords={"cov": [0], "cell_type": ["0", "1", "2", "3", "4"]})
50 changes: 50 additions & 0 deletions tests/tests_during_development/debug.py
@@ -0,0 +1,50 @@
+ import numpy as np
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import importlib
+ import arviz as az
+
+ from scdcdm.util import data_generation as gen
+ from scdcdm.util import comp_ana as mod
+ from scdcdm.util import result_classes as res
+ from scdcdm.util import multi_parameter_sampling as mult
+ from scdcdm.util import cell_composition_data as dat
+
+ pd.options.display.float_format = '{:10,.3f}'.format
+ pd.set_option('display.max_columns', None)
+
+ # Artificial data
+ np.random.seed(1234)
+
+ n = 3
+
+ cases = 1
+ K = 5
+ n_samples = [n, n]
+ n_total = np.full(shape=[2*n], fill_value=1000)
+
+ data = gen.generate_case_control(cases, K, n_total[0], n_samples,
+                                  w_true=np.array([[1, 0, 0, 0, 0]]),
+                                  b_true=np.log(np.repeat(0.2, K)).tolist())
+
+ print(data.uns["w_true"])
+ print(data.uns["b_true"])
+
+ print(data.X)
+ print(data.obs)
+
+ #%%
+ importlib.reload(mod)
+
+ ana = mod.CompositionalAnalysis(data, "x_0", baseline_index=None)
+ print(ana.x)
+ print(ana.y)
+ print(ana.covariate_names)
+
+ params_mcmc = ana.sample_hmc(num_results=int(1000), n_burnin=500)
+ print(params_mcmc)
+
+ #%%
+
+ params_mcmc.summary(hdi_prob=0.9)
7 changes: 4 additions & 3 deletions tests/tests_during_development/final_model_testing.py
@@ -72,18 +72,19 @@

ana = mod.CompositionalAnalysis(data, "x_0", baseline_index=None)
print(ana.x)
print(ana.y)
print(ana.covariate_names)

#%%
importlib.reload(mod)
params_mcmc = ana.sample_hmc(num_results=int(1000), n_burnin=500)
print(params_mcmc)

#%%
params_mcmc.summary()

#%%
- params_mcmc.summary_extended(credible_interval=0.9)

+ params_mcmc.summary_extended(hdi_prob=0.9)

#%%
path = "data/test"
@@ -118,7 +119,7 @@
params_mcmc = ana_simple.sample_hmc(num_results=int(2000), n_burnin=500)

#%%
- params_mcmc.summary(credible_interval=0.95)
+ params_mcmc.summary(hdi_prob=0.95)

#%%
az.plot_trace(params_mcmc)
150 changes: 89 additions & 61 deletions tests/tests_during_development/sle_freq_analysis.ipynb
@@ -124,24 +124,28 @@
"print(cluster_names)\n",
"\n",
"cell_types = pd.DataFrame(index=cluster_names)\n",
"print(cell_types)\n",
"\n",
"#%%# Put all together\n",
"\n",
"sle_freq_data = ad.AnnData(X=cell_counts, var=cell_types, obs=group_df)"
"print(cell_types)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"### Modeling without baseline\n"
"sle_freq_data = ad.AnnData(X=cell_counts, var=cell_types, obs=group_df)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
@@ -467,28 +471,32 @@
}
],
"source": [
"\n",
"ana = mod.CompositionalAnalysis(sle_freq_data, \"Group\", baseline_index=None)\n",
"ca_result = ana.sample_hmc(num_results=int(20000), n_burnin=5000)\n",
"\n",
"ca_result.summary(credible_interval=0.95)"
"### Modeling without baseline\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"This does not work well (too many cell types)! -> all inclusion_probs are 0 or 1\n",
"\n",
"Lowering number of cell types by excluding underrepresented ones...\n",
"ana = mod.CompositionalAnalysis(sle_freq_data, \"Group\", baseline_index=None)\n",
"ca_result = ana.sample_hmc(num_results=int(20000), n_burnin=5000)\n",
"\n",
"### Use only celltypes with more than 3% of total cells in at least one sample (17 types)\n"
"ca_result.summary(hdi_prob=0.95)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
@@ -509,29 +517,27 @@
}
],
"source": [
"cells_rep = cell_counts_df.loc[:, ~(cell_freq < 0.03).all()]\n",
"cells_rep_rel = cell_freq.loc[:, ~(cell_freq < 0.03).all()]\n",
"cell_types_rep = pd.DataFrame(index=cells_rep.columns)\n",
"\n",
"sle_freq_data = ad.AnnData(X=cells_rep.values, var=cell_types_rep, obs=group_df)\n",
"print(sle_freq_data.X.shape)\n",
"\n",
"#%%\n",
"cells_plot = pd.melt(cells_rep_rel.reset_index(), id_vars=\"file\", var_name=\"Cell_Type\", value_name=\"Count\")\n",
"This does not work well (too many cell types)! -> all inclusion_probs are 0 or 1\n",
"\n",
"cells_plot_2 = pd.melt(cells_rep_rel.drop(columns=\"Clust_36|percent_total\").reset_index(), id_vars=\"file\", var_name=\"Cell_Type\", value_name=\"Count\")\n",
"Lowering number of cell types by excluding underrepresented ones...\n",
"\n",
"fig, ax = plt.subplots(figsize=(12, 5))\n",
"sns.barplot(data=cells_plot, x=\"Cell_Type\", y=\"Count\", hue=\"file\", ax=ax)\n",
"plt.xticks(rotation=90)\n",
"plt.show()"
"### Use only celltypes with more than 3% of total cells in at least one sample (17 types)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"### Modeling without baseline\n"
"cells_rep = cell_counts_df.loc[:, ~(cell_freq < 0.03).all()]\n",
"cells_rep_rel = cell_freq.loc[:, ~(cell_freq < 0.03).all()]\n",
"cell_types_rep = pd.DataFrame(index=cells_rep.columns)\n",
"\n",
"sle_freq_data = ad.AnnData(X=cells_rep.values, var=cell_types_rep, obs=group_df)\n",
"print(sle_freq_data.X.shape)"
]
},
{
@@ -677,21 +683,21 @@
}
],
"source": [
"ana = mod.CompositionalAnalysis(sle_freq_data, \"int_Group\", baseline_index=None)\n",
"cells_plot = pd.melt(cells_rep_rel.reset_index(), id_vars=\"file\", var_name=\"Cell_Type\", value_name=\"Count\")\n",
"\n",
"# Still bad convergence -> increase chain length\n",
"ca_result = ana.sample_hmc(num_results=int(100000), n_burnin=5000)\n",
"cells_plot_2 = pd.melt(cells_rep_rel.drop(columns=\"Clust_36|percent_total\").reset_index(), id_vars=\"file\", var_name=\"Cell_Type\", value_name=\"Count\")\n",
"\n",
"ca_result.summary(credible_interval=0.95)"
"fig, ax = plt.subplots(figsize=(12, 5))\n",
"sns.barplot(data=cells_plot, x=\"Cell_Type\", y=\"Count\", hue=\"file\", ax=ax)\n",
"plt.xticks(rotation=90)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### With baseline\n",
"\n",
"Baseline on celltype clust_26 (index 9)"
"### Modeling without baseline\n"
]
},
{
@@ -700,32 +706,47 @@
"metadata": {},
"outputs": [],
"source": [
"ana_bl_reduced = mod.CompositionalAnalysis(sle_freq_data, \"int_Group\", baseline_index=9)\n",
"ana = mod.CompositionalAnalysis(sle_freq_data, \"int_Group\", baseline_index=None)\n",
"\n",
"# Still bad convergence -> increase chain length\n",
"result_bl_reduced = ana_bl_reduced.sample_hmc(num_results=int(100000), n_burnin=5000)\n",
"ca_result = ana.sample_hmc(num_results=int(100000), n_burnin=5000)\n",
"\n",
"result_bl_reduced.summary(credible_interval=0.95)"
"ca_result.summary(hdi_prob=0.95)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Not a good idea from relative point of view (see plot)"
"### With baseline\n",
"\n",
"Baseline on celltype clust_26 (index 9)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"Baseline on celltype clust_04 (index 1)"
"ana_bl_reduced = mod.CompositionalAnalysis(sle_freq_data, \"int_Group\", baseline_index=9)\n",
"\n",
"# Still bad convergence -> increase chain length\n",
"result_bl_reduced = ana_bl_reduced.sample_hmc(num_results=int(100000), n_burnin=5000)\n",
"\n",
"result_bl_reduced.summary(hdi_prob=0.95)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
@@ -841,19 +862,14 @@
}
],
"source": [
"ana_bl_reduced = mod.CompositionalAnalysis(sle_freq_data, \"int_Group\", baseline_index=1)\n",
"\n",
"# Still bad convergence -> increase chain length\n",
"result_bl_reduced = ana_bl_reduced.sample_hmc(num_results=int(100000), n_burnin=5000)\n",
"\n",
"result_bl_reduced.summary(credible_interval=0.95)"
"Not a good idea from relative point of view (see plot)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Baseline on celltype clust_09 (index 3)"
"Baseline on celltype clust_04 (index 1)"
]
},
{
@@ -975,18 +991,22 @@
}
],
"source": [
"ana_bl_reduced = mod.CompositionalAnalysis(sle_freq_data, \"int_Group\", baseline_index=3)\n",
"ana_bl_reduced = mod.CompositionalAnalysis(sle_freq_data, \"int_Group\", baseline_index=1)\n",
"\n",
"# Still bad convergence -> increase chain length\n",
"result_bl_reduced = ana_bl_reduced.sample_hmc(num_results=int(100000), n_burnin=5000)\n",
"\n",
"result_bl_reduced.summary(credible_interval=0.95)"
"result_bl_reduced.summary(hdi_prob=0.95)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
@@ -1000,8 +1020,7 @@
}
],
"source": [
"az.plot_trace(result_bl_reduced)\n",
"plt.show()"
"Baseline on celltype clust_09 (index 3)"
]
}
],
@@ -1022,8 +1041,17 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
6 changes: 3 additions & 3 deletions tests/tests_during_development/sle_freq_analysis.py
@@ -59,7 +59,7 @@
#%%
ca_result = ana.sample_hmc(num_results=int(20000), n_burnin=5000)

- ca_result.summary(credible_interval=0.95)
+ ca_result.summary(hdi_prob=0.95)

#%%
az.plot_trace(ca_result)
@@ -73,7 +73,7 @@
#%%
ca_result_2 = ana_2.sample_hmc(num_results=int(2e4), n_burnin=5e3)

- ca_result_2.summary(credible_interval=0.95)
+ ca_result_2.summary(hdi_prob=0.95)


#%%
@@ -106,7 +106,7 @@
#%%
ca_result = ana.sample_hmc(num_results=int(20000), n_burnin=5000)

- ca_result.summary(credible_interval=0.95)
+ ca_result.summary(hdi_prob=0.95)

#%%
az.plot_trace(ca_result)
463 changes: 300 additions & 163 deletions tutorials/Tutorial.ipynb

Large diffs are not rendered by default.

Binary file added tutorials/test
Binary file not shown.
