forked from theislab/scCODA
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added functions for data visualization, extended model comparison
1 parent
682852a
commit cabc79d
Showing
10 changed files
with
2,296 additions
and
35 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -111,3 +111,4 @@ venv.bak/ | |
|
||
# Other | ||
prototyping/ | ||
.pkl |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
""" | ||
This script contains methods to visualize compositional data that was imported into SCDCdm's data format | ||
:authors: Johannes Ostner | ||
""" | ||
|
||
# Setup | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import matplotlib.pyplot as plt | ||
import seaborn as sns | ||
from matplotlib.pyplot import cm | ||
|
||
sns.set_style("ticks") | ||
|
||
|
||
def plot_one_stackbar(y, type_names, title, level_names): | ||
|
||
""" | ||
Plots a stacked barplot for one (discrete) covariate | ||
Typical use: plot_one_stackbar(data.X, data.var.index, "xyz", data.obs.index) | ||
Parameters | ||
---------- | ||
y -- numpy array | ||
The count data, collapsed onto the level of interest. i.e. a binary covariate has two rows containing the count | ||
mean of all samples of each cell type | ||
type_names -- list-like | ||
The names of all cell types | ||
title -- string | ||
Plot title, usually the covariate's name | ||
level_names -- list-like | ||
names of the covariate's levels | ||
Returns | ||
------- | ||
a plot | ||
""" | ||
|
||
plt.figure(figsize=(20, 10)) | ||
n_samples, n_types = y.shape | ||
r = np.array(range(n_samples)) | ||
sample_sums = np.sum(y, axis=1) | ||
barwidth = 0.85 | ||
cum_bars = np.zeros(n_samples) | ||
colors = cm.tab20 | ||
|
||
for n in range(n_types): | ||
bars = [i / j * 100 for i, j in zip([y[k][n] for k in range(n_samples)], sample_sums)] | ||
plt.bar(r, bars, bottom=cum_bars, color=colors(n % 20), width=barwidth, label=type_names[n]) | ||
cum_bars += bars | ||
|
||
plt.title(title) | ||
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=1) | ||
plt.xticks(r, level_names, rotation=45) | ||
|
||
plt.show() | ||
|
||
|
||
def plot_feature_stackbars(data, features): | ||
|
||
""" | ||
Plots stackbars for all listed covariates | ||
Usage: plot_feature_stackbars(data, ["cov1", "cov2", "cov3"]) | ||
Parameters | ||
---------- | ||
data -- AnnData object | ||
A SCDCdm-compatible data object | ||
features -- list | ||
List of all covariates to plot. Specifying "samples" as an element will plot a stackbar with all samples | ||
Returns | ||
------- | ||
One plot for every category | ||
""" | ||
|
||
type_names = data.var.index | ||
for f in features: | ||
|
||
if f == "samples": | ||
plot_one_stackbar(data.X, data.var.index, "samples", data.obs.index) | ||
else: | ||
levels = pd.unique(data.obs[f]) | ||
n_levels = len(levels) | ||
feature_totals = np.zeros([n_levels, data.X.shape[1]]) | ||
|
||
for level in range(n_levels): | ||
l_indices = np.where(data.obs[f] == levels[level]) | ||
feature_totals[level] = np.sum(data.X[l_indices], axis=0) | ||
|
||
plot_one_stackbar(feature_totals, type_names=type_names, level_names=levels, title=f) | ||
|
||
|
||
def grouped_boxplot(data, feature, log_scale=False, *args, **kwargs): | ||
""" | ||
Grouped boxplot -cell types on x-Axis; one boxplot per feature level for each cell type | ||
Parameters | ||
---------- | ||
data -- AnnData object | ||
A SCDCdm-compatible data object | ||
feature -- string | ||
The name of the feature in data.obs to plot | ||
log_scale -- bool | ||
If true, use log(data.X + 1) (pseudocount 1 to avoid log(0)-issues) | ||
*args, **kwargs -- Passed to sns.boxplot | ||
Returns | ||
------- | ||
Plot! | ||
""" | ||
|
||
plt.figure(figsize=(20, 10)) | ||
|
||
# add pseudocount 1 if using log scale (needs to be improved) | ||
if log_scale: | ||
X = np.log(data.X + 1) | ||
value_name = "log(count)" | ||
else: | ||
X = data.X | ||
value_name = "count" | ||
|
||
count_df = pd.DataFrame(X, columns=data.var.index, index=data.obs.index).\ | ||
merge(data.obs[feature], left_index=True, right_index=True) | ||
plot_df = pd.melt(count_df, id_vars=feature, var_name="Cell type", value_name=value_name) | ||
|
||
d = sns.boxplot(x="Cell type", y=value_name, hue=feature, data=plot_df, fliersize=1, | ||
palette='Blues', *args, **kwargs) | ||
|
||
loc, labels = plt.xticks() | ||
d.set_xticklabels(labels, rotation=90) | ||
plt.show() | ||
|
||
|
||
def boxplot_facets(data, feature, log_scale=False, args_boxplot={}, args_swarmplot={}): | ||
""" | ||
Grouped boxplot -cell types on x-Axis; one boxplot per feature level for each cell type | ||
Parameters | ||
---------- | ||
data -- AnnData object | ||
A SCDCdm-compatible data object | ||
feature -- string | ||
The name of the feature in data.obs to plot | ||
log_scale -- bool | ||
If true, use log(data.X + 1) (pseudocount 1 to avoid log(0)-issues) | ||
args_boxplot -- dict | ||
Arguments passed to sns.boxplot | ||
args_swarmplot -- dict | ||
Arguments passed to sns.swarmplot | ||
Returns | ||
------- | ||
Plot! | ||
""" | ||
|
||
# add pseudocount 1 if using log scale (needs to be improved) | ||
if log_scale: | ||
X = np.log(data.X + 1) | ||
value_name = "log(count)" | ||
else: | ||
X = data.X | ||
value_name = "count" | ||
|
||
K = X.shape[1] | ||
|
||
count_df = pd.DataFrame(X, columns=data.var.index, index=data.obs.index).merge(data.obs, left_index=True, | ||
right_index=True) | ||
plot_df = pd.melt(count_df, id_vars=data.obs.columns, var_name="Cell type", value_name=value_name) | ||
print(plot_df) | ||
|
||
if "hue" in args_swarmplot: | ||
hue = args_swarmplot.pop("hue") | ||
else: | ||
hue = None | ||
|
||
g = sns.FacetGrid(plot_df, col="Cell type", sharey=False, col_wrap=np.floor(np.sqrt(K)), height=5, aspect=2) | ||
g.map(sns.boxplot, feature, value_name, palette="Blues", **args_boxplot) | ||
if hue is None: | ||
g.map(sns.swarmplot, feature, value_name, color="black", **args_swarmplot).set_titles("{col_name}") | ||
else: | ||
g.map(sns.swarmplot, feature, value_name, hue, **args_swarmplot).set_titles("{col_name}") | ||
plt.show() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1,712 changes: 1,712 additions & 0 deletions
1,712
tests/microbiome_testing/MovingPictures_Analysis.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
import numpy as np | ||
import pandas as pd | ||
import matplotlib.pyplot as plt | ||
import seaborn as sns | ||
import importlib | ||
import arviz as az | ||
import pickle as pkl | ||
from matplotlib.pyplot import cm | ||
import matplotlib | ||
|
||
from scdcdm.util import data_generation as gen | ||
from scdcdm.util import comp_ana as mod | ||
from scdcdm.util import result_classes as res | ||
from scdcdm.util import multi_parameter_sampling as mult | ||
from scdcdm.util import cell_composition_data as dat | ||
from scdcdm.util import data_visualization as viz | ||
|
||
pd.options.display.float_format = '{:10,.3f}'.format | ||
pd.set_option('display.max_columns', 10) | ||
|
||
#%% | ||
|
||
data_path = "C:/Users/Johannes/Documents/PhD/data/rosen_mincount10_maxee2_trim200_results_forpaper/" | ||
|
||
otu_file = "rosen_mincount10_maxee2_trim200.otu_table.99.denovo.rdp_assigned.paper_samples.txt" | ||
|
||
with open(data_path + otu_file, "rb") as file: | ||
otus = pd.read_csv(file, sep="\t", index_col=0) | ||
|
||
print(otus) | ||
|
||
#%% | ||
meta_file = "patient_clinical_metadata.csv" | ||
|
||
with open(data_path + meta_file, "rb") as file: | ||
metadata = pd.read_csv(file, sep=",", index_col=0) | ||
|
||
metadata = metadata[~metadata.index.str.endswith('F')] | ||
metadata = metadata[~metadata.index.str.endswith('sick')] | ||
metadata = metadata[~metadata.index.str.endswith('F2')] | ||
metadata = metadata[~metadata.index.str.endswith('F2T')] | ||
metadata = metadata[~metadata.index.str.endswith('2')] | ||
metadata = metadata[~metadata.index.str.startswith('05')] | ||
|
||
print(metadata) | ||
|
||
#%% | ||
|
||
meta_rel = metadata[(~pd.isna(metadata["mbs_consolidated"])) & (~pd.isna(metadata["bal"]))] | ||
|
||
print(meta_rel) | ||
|
||
#%% | ||
|
||
print(meta_rel.groupby("mbs_consolidated").count()) | ||
|
||
#%% | ||
|
||
otus_rel = otus.loc[otus.index.isin(meta_rel.index)] | ||
|
||
print(otus_rel) | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
212 changes: 212 additions & 0 deletions
212
tests/microbiome_testing/moving_pictures_consistency_test.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,212 @@ | ||
""" | ||
Testing scdcdm for consistency on the "Moving pictures" dataset | ||
""" | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import matplotlib.pyplot as plt | ||
import seaborn as sns | ||
import importlib | ||
import arviz as az | ||
import pickle as pkl | ||
from matplotlib.pyplot import cm | ||
import matplotlib | ||
|
||
from scdcdm.util import data_generation as gen | ||
from scdcdm.util import comp_ana as mod | ||
from scdcdm.util import result_classes as res | ||
from scdcdm.util import multi_parameter_sampling as mult | ||
from scdcdm.util import cell_composition_data as dat | ||
from scdcdm.util import data_visualization as viz | ||
|
||
pd.options.display.float_format = '{:10,.3f}'.format | ||
pd.set_option('display.max_columns', None) | ||
|
||
write_path = "C:/Users/Johannes/Documents/Uni/Master's_Thesis/compositionalDiff-johannes_tests_2/tests/microbiome_testing" | ||
|
||
#%% | ||
|
||
# read phylum-level data from biom file as tsv | ||
data_path = "C:/Users/Johannes/AppData/Local/Packages/CanonicalGroupLimited.Ubuntu18.04onWindows_79rhkp1fndgsc/LocalState/rootfs/home/johannes/qiime2_projects/moving-pictures-tutorial" | ||
|
||
with open(data_path+"/exported_data/feature-table-l2.tsv", "rb") as f: | ||
biom_data = pd.read_csv(f, sep="\t", header=1, index_col="#OTU ID") | ||
|
||
biom_data = biom_data.transpose() | ||
|
||
# remove rare groups (<10 in all samples) | ||
|
||
# read metadata | ||
with open(data_path+"/sample-metadata.tsv", "rb") as f: | ||
metadata = pd.read_csv(f, sep="\t", index_col="sample-id").iloc[1:, :] | ||
|
||
metadata_columns = ["subject", "reported-antibiotic-usage", "days-since-experiment-start", "body-site"] | ||
|
||
# add subject to count data | ||
biom_data = pd.merge(biom_data, metadata[metadata_columns], left_index=True, right_index=True) | ||
|
||
data = dat.from_pandas(biom_data, metadata_columns) | ||
data.obs = data.obs.rename(columns={"reported-antibiotic-usage": "antibiotic", "body-site": "site", | ||
"days-since-experiment-start": "days_since_start"}) | ||
|
||
print(data.obs) | ||
|
||
#%% | ||
importlib.reload(viz) | ||
|
||
sns.set(style="ticks", font_scale=2) | ||
args_swarmplot={"hue": "subject", "size": 10, "palette": "Reds"} | ||
viz.boxplot_facets(data, feature="site") | ||
plt.show() | ||
|
||
#%% | ||
|
||
# Model that differentiates both palms | ||
model_palms = mod.CompositionalAnalysis(data[data.obs["site"].isin(["left palm", "right palm"])], "site", baseline_index=None) | ||
|
||
result_palms = model_palms.sample_hmc(num_results=int(20000), n_burnin=5000) | ||
|
||
result_palms.summary_extended(hdi_prob=0.95) | ||
|
||
#%% | ||
|
||
with az.rc_context(rc={'plot.max_subplots': None}): | ||
az.plot_trace(result_palms, compact=True) | ||
plt.show() | ||
|
||
#%% | ||
|
||
# less samples, less burnin | ||
result_palms_5000 = model_palms.sample_hmc(num_results=int(5000), n_burnin=1000) | ||
|
||
result_palms_5000.summary_extended(hdi_prob=0.95) | ||
|
||
#%% | ||
|
||
with az.rc_context(rc={'plot.max_subplots': None}): | ||
az.plot_trace(result_palms_5000, var_names="beta") | ||
plt.show() | ||
|
||
|
||
|
||
#%% | ||
|
||
with az.rc_context(rc={'plot.max_subplots': None}): | ||
az.plot_trace(result_palms, var_names=["beta", "b_raw"]) | ||
plt.show() | ||
|
||
#%% | ||
|
||
az.rhat(result_palms, method="folded") | ||
|
||
|
||
#%% | ||
# run palms comparison multiple times | ||
|
||
results = [] | ||
n_chains = 50 | ||
|
||
model_palms = mod.CompositionalAnalysis(data[data.obs["site"].isin(["left palm", "right palm"])], "site", baseline_index=None) | ||
|
||
for n in range(n_chains): | ||
result_temp = model_palms.sample_hmc(num_results=int(20000), n_burnin=5000) | ||
|
||
results.append(result_temp) | ||
|
||
#%% | ||
res_all = az.concat(results, dim="chain") | ||
|
||
print(res_all.posterior) | ||
|
||
#%% | ||
az.to_netcdf(res_all, write_path + "/multi_chain_50_len20000_all") | ||
|
||
#%% | ||
|
||
acc_probs = pd.DataFrame(pd.concat([r.effect_df.loc[:, "Inclusion probability"] for r in results])) | ||
|
||
acc_probs["chain_no"] = np.concatenate([np.repeat(i+1, 21) for i in range(n_chains)]) | ||
|
||
acc_probs.index = acc_probs.index.droplevel(0) | ||
|
||
acc_probs = acc_probs.reset_index() | ||
|
||
print(acc_probs) | ||
|
||
#%% | ||
with open(write_path + "/multi_chain_50_len20000_acc", "wb") as file: | ||
pkl.dump(acc_probs, file) | ||
|
||
|
||
#%% | ||
with open(write_path + "/multi_chain_50_len20000_acc", "rb") as file: | ||
acc_probs = pkl.load(file) | ||
|
||
res_all = az.from_netcdf(write_path + "/multi_chain_50_len20000_all") | ||
|
||
#%% | ||
coords = {"cell_type": "k__Bacteria;p__Proteobacteria"} | ||
az.plot_trace(res_all, var_names="beta", coords=coords) | ||
plt.show() | ||
|
||
|
||
#%% | ||
sns.set(style="ticks", font_scale=1) | ||
|
||
n_chains = 50 | ||
col = [cm.tab20(i % 20) for i in range(n_chains)] | ||
|
||
g = sns.FacetGrid(data=acc_probs.loc[acc_probs["Cell Type"].isin(["k__Bacteria;p__Fusobacteria", "k__Bacteria;p__Firmicutes", "k__Bacteria;p__Tenericutes"])], col="Cell Type", col_wrap=3) | ||
g.map(sns.kdeplot, "Inclusion probability") | ||
rug = g.map(sns.rugplot, "Inclusion probability", height=0.3, color="black") | ||
for ax in g.axes: | ||
ax.axvline(0.81, color="red", linewidth=0.5) | ||
|
||
# There is no labels, need to define the labels | ||
legend_labels = [i+1 for i in range(n_chains)] | ||
|
||
# Create the legend patches | ||
legend_patches = [matplotlib.patches.Patch(color=C, label=L) for | ||
C, L in zip(col, legend_labels)] | ||
|
||
# Plot the legend | ||
# plt.legend(handles=legend_patches, loc="lower right") | ||
|
||
plt.show() | ||
|
||
|
||
#%% | ||
# Only inconsistencies for Firmicutes | ||
|
||
acc_probs["is_significant"] = np.where(acc_probs["Inclusion probability"] > 0.81, True, False) | ||
|
||
sig_table = acc_probs.groupby(["Cell Type"]).agg({"Inclusion probability": "mean", "is_significant": "sum"}) | ||
|
||
print(sig_table) | ||
|
||
#%% | ||
|
||
types = ["k__Bacteria;p__Proteobacteria", "k__Bacteria;p__FBP"] | ||
|
||
ax = az.plot_pair( | ||
res_all, | ||
kind=["scatter", "kde"], | ||
kde_kwargs={"fill_last": False}, | ||
marginals=True, | ||
coords={"chain": [33], "cell_type": types, "cell_type_nb": types}, | ||
point_estimate="median", | ||
) | ||
|
||
plt.show() | ||
|
||
#%% | ||
|
||
az.rhat(res_all) | ||
|
||
#%% | ||
|
||
az.summary(res_all) | ||
|
||
#%% | ||
|
||
|