added functions for data visualization, extended model comparison
johannesostner committed Sep 2, 2020
1 parent 682852a commit cabc79d
Showing 10 changed files with 2,296 additions and 35 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -111,3 +111,4 @@ venv.bak/

# Other
prototyping/
*.pkl
2 changes: 1 addition & 1 deletion scdcdm/model/dirichlet_models.py
@@ -176,7 +176,7 @@ def sample_hmc(self, num_results=int(20e3), n_burnin=int(5e3), num_leapfrog_step
hmc_kernel = tfp.mcmc.TransformedTransitionKernel(
inner_kernel=hmc_kernel, bijector=constraining_bijectors)
hmc_kernel = tfp.mcmc.SimpleStepSizeAdaptation(
inner_kernel=hmc_kernel, num_adaptation_steps=int(4000), target_accept_prob=0.8)
inner_kernel=hmc_kernel, num_adaptation_steps=int(0.8*n_burnin), target_accept_prob=0.8)

# HMC sampling
states, kernel_results, duration = self.sampling(num_results, n_burnin, hmc_kernel, self.params)
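
For context (not part of this commit): tying the adaptation steps to 0.8 * n_burnin keeps step-size adaptation inside the burn-in phase for any run length instead of assuming the default n_burnin=5000; TFP examples typically adapt for roughly the first 80% of burn-in. A minimal, self-contained sketch of the same kernel stack, with a toy standard-normal target standing in for the model's actual log-posterior:

import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions
target = tfd.Normal(loc=0., scale=1.)  # toy stand-in for the model's posterior

num_results, n_burnin = 1000, 500

hmc_kernel = tfp.mcmc.HamiltonianMonteCarlo(
    target_log_prob_fn=target.log_prob,
    step_size=0.01,
    num_leapfrog_steps=10)
# Adapt the step size only during (80% of) burn-in, as in the patched sample_hmc
adaptive_kernel = tfp.mcmc.SimpleStepSizeAdaptation(
    inner_kernel=hmc_kernel,
    num_adaptation_steps=int(0.8 * n_burnin),
    target_accept_prob=0.8)

states = tfp.mcmc.sample_chain(
    num_results=num_results,
    num_burnin_steps=n_burnin,
    current_state=tf.constant(0.),
    kernel=adaptive_kernel,
    trace_fn=None)
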
41 changes: 38 additions & 3 deletions scdcdm/model/other_models.py
@@ -515,7 +515,7 @@ def fit_model(self):
p_val = [0 for x in range(K)]
else:
# computes clr-transformed data matrix as a pandas DataFrame
geom_mean = np.prod(self.y, axis=1, keepdims=True) ** (1 / D)
geom_mean = np.prod(self.y, axis=1, keepdims=True) ** (1 / K)
y_clr = np.log(self.y / geom_mean)

for k in range(K):
@@ -528,7 +528,7 @@ def fit_model(self):
self.p_val = p_val


class KSTest(FrequentistModel):
class TTest(FrequentistModel):
"""
Implements a t-test on each cell type in the scdcdm framework
(for model comparison purposes)
@@ -554,7 +554,42 @@ def fit_model(self):
else:
for k in range(K):

test = stats.ks_2samp(self.y[0:n_group, k], self.y[n_group:, k])
test = stats.ttest_ind(self.y[0:n_group, k], self.y[n_group:, k])
p_val.append(test[1])

self.p_val = p_val


class CLR_ttest(FrequentistModel):
"""
Implements a CLR transform and a subsequent t-test on each cell type in the scdcdm framework
(for model comparison purposes)
"""
def fit_model(self):
"""
Fits CLR model
Returns
-------
p_val : list
p-values for differential abundance test of all cell types
"""

p_val = []
N, K = self.y.shape
D = self.x.shape[1]

n_group = int(N/2)

if self.y.shape[0] == 2:
p_val = [0 for x in range(K)]
else:
# computes clr-transformed data matrix as a pandas DataFrame
geom_mean = np.prod(self.y, axis=1, keepdims=True) ** (1 / K)
y_clr = np.log(self.y / geom_mean)

for k in range(K):
test = stats.ttest_ind(y_clr[0:n_group, k], y_clr[n_group:, k])
p_val.append(test[1])

self.p_val = p_val
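
Outside the class hierarchy, the new CLR-plus-t-test comparison reduces to a few lines of NumPy/SciPy. For illustration only (toy counts, groups split down the middle as in fit_model), a standalone sketch:

import numpy as np
from scipy import stats

rng = np.random.default_rng(1234)
# Toy count matrix: N samples x K cell types; first half group A, second half group B
y = rng.poisson(lam=[50, 100, 200, 30, 500], size=(10, 5)) + 1  # +1 avoids log(0)
N, K = y.shape
n_group = N // 2

# CLR transform: log of counts divided by their per-sample geometric mean
geom_mean = np.prod(y, axis=1, keepdims=True) ** (1 / K)
y_clr = np.log(y / geom_mean)

# Two-sample t-test per cell type on the CLR-transformed values
p_val = [stats.ttest_ind(y_clr[:n_group, k], y_clr[n_group:, k])[1] for k in range(K)]
print(p_val)
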
184 changes: 184 additions & 0 deletions scdcdm/util/data_visualization.py
@@ -0,0 +1,184 @@
"""
This script contains methods to visualize compositional data that was imported into SCDCdm's data format
:authors: Johannes Ostner
"""

# Setup

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import cm

sns.set_style("ticks")


def plot_one_stackbar(y, type_names, title, level_names):

"""
Plots a stacked barplot for one (discrete) covariate
Typical use: plot_one_stackbar(data.X, data.var.index, "xyz", data.obs.index)
Parameters
----------
y -- numpy array
The count data, collapsed onto the level of interest, e.g. for a binary covariate: two rows, each containing the
mean count of every cell type over all samples of that level
type_names -- list-like
The names of all cell types
title -- string
Plot title, usually the covariate's name
level_names -- list-like
names of the covariate's levels
Returns
-------
a plot
"""

plt.figure(figsize=(20, 10))
n_samples, n_types = y.shape
r = np.array(range(n_samples))
sample_sums = np.sum(y, axis=1)
barwidth = 0.85
cum_bars = np.zeros(n_samples)
colors = cm.tab20

for n in range(n_types):
bars = [i / j * 100 for i, j in zip([y[k][n] for k in range(n_samples)], sample_sums)]
plt.bar(r, bars, bottom=cum_bars, color=colors(n % 20), width=barwidth, label=type_names[n])
cum_bars += bars

plt.title(title)
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=1)
plt.xticks(r, level_names, rotation=45)

plt.show()


def plot_feature_stackbars(data, features):

"""
Plots stackbars for all listed covariates
Usage: plot_feature_stackbars(data, ["cov1", "cov2", "cov3"])
Parameters
----------
data -- AnnData object
A SCDCdm-compatible data object
features -- list
List of all covariates to plot. Specifying "samples" as an element will plot a stackbar with all samples
Returns
-------
One plot for every category
"""

type_names = data.var.index
for f in features:

if f == "samples":
plot_one_stackbar(data.X, data.var.index, "samples", data.obs.index)
else:
levels = pd.unique(data.obs[f])
n_levels = len(levels)
feature_totals = np.zeros([n_levels, data.X.shape[1]])

for level in range(n_levels):
l_indices = np.where(data.obs[f] == levels[level])
feature_totals[level] = np.sum(data.X[l_indices], axis=0)

plot_one_stackbar(feature_totals, type_names=type_names, level_names=levels, title=f)


def grouped_boxplot(data, feature, log_scale=False, *args, **kwargs):
"""
Grouped boxplot - cell types on the x-axis; one boxplot per feature level for each cell type
Parameters
----------
data -- AnnData object
A SCDCdm-compatible data object
feature -- string
The name of the feature in data.obs to plot
log_scale -- bool
If True, use log(data.X + 1) (pseudocount of 1 to avoid log(0) issues)
*args, **kwargs -- Passed to sns.boxplot
Returns
-------
Plot!
"""

plt.figure(figsize=(20, 10))

# add pseudocount 1 if using log scale (needs to be improved)
if log_scale:
X = np.log(data.X + 1)
value_name = "log(count)"
else:
X = data.X
value_name = "count"

count_df = pd.DataFrame(X, columns=data.var.index, index=data.obs.index).\
merge(data.obs[feature], left_index=True, right_index=True)
plot_df = pd.melt(count_df, id_vars=feature, var_name="Cell type", value_name=value_name)

d = sns.boxplot(x="Cell type", y=value_name, hue=feature, data=plot_df, fliersize=1,
palette='Blues', *args, **kwargs)

loc, labels = plt.xticks()
d.set_xticklabels(labels, rotation=90)
plt.show()


def boxplot_facets(data, feature, log_scale=False, args_boxplot={}, args_swarmplot={}):
"""
Faceted boxplots - one facet (subplot) per cell type; one boxplot plus swarmplot overlay per feature level
Parameters
----------
data -- AnnData object
A SCDCdm-compatible data object
feature -- string
The name of the feature in data.obs to plot
log_scale -- bool
If True, use log(data.X + 1) (pseudocount of 1 to avoid log(0) issues)
args_boxplot -- dict
Arguments passed to sns.boxplot
args_swarmplot -- dict
Arguments passed to sns.swarmplot
Returns
-------
Plot!
"""

# add pseudocount 1 if using log scale (needs to be improved)
if log_scale:
X = np.log(data.X + 1)
value_name = "log(count)"
else:
X = data.X
value_name = "count"

K = X.shape[1]

count_df = pd.DataFrame(X, columns=data.var.index, index=data.obs.index).merge(data.obs, left_index=True,
right_index=True)
plot_df = pd.melt(count_df, id_vars=data.obs.columns, var_name="Cell type", value_name=value_name)
print(plot_df)

if "hue" in args_swarmplot:
hue = args_swarmplot.pop("hue")
else:
hue = None

g = sns.FacetGrid(plot_df, col="Cell type", sharey=False, col_wrap=int(np.floor(np.sqrt(K))), height=5, aspect=2)
g.map(sns.boxplot, feature, value_name, palette="Blues", **args_boxplot)
if hue is None:
g.map(sns.swarmplot, feature, value_name, color="black", **args_swarmplot).set_titles("{col_name}")
else:
g.map(sns.swarmplot, feature, value_name, hue, **args_swarmplot).set_titles("{col_name}")
plt.show()
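
The new helpers all expect a SCDCdm-style AnnData object: counts in data.X, cell type names in data.var.index, covariates in data.obs. A usage sketch with synthetic data (not part of this commit; the from_pandas call mirrors how it is used in the scripts further down):

import numpy as np
import pandas as pd

from scdcdm.util import cell_composition_data as dat
from scdcdm.util import data_visualization as viz

# Synthetic counts: 6 samples x 4 cell types, plus one binary covariate
rng = np.random.default_rng(0)
counts = pd.DataFrame(rng.poisson(100, size=(6, 4)),
                      columns=[f"type_{i}" for i in range(4)],
                      index=[f"sample_{i}" for i in range(6)])
counts["condition"] = ["A", "A", "A", "B", "B", "B"]

data = dat.from_pandas(counts, ["condition"])

# Stacked barplots per covariate level and per sample
viz.plot_feature_stackbars(data, ["condition", "samples"])

# Grouped boxplot and faceted boxplots with swarm overlay
viz.grouped_boxplot(data, feature="condition", log_scale=True)
viz.boxplot_facets(data, feature="condition",
                   args_swarmplot={"hue": "condition", "size": 5})
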
114 changes: 84 additions & 30 deletions tests/benchmark_analysis/model_comparison_analysis.py
@@ -238,8 +238,8 @@


fg.add_legend()
# plt.savefig(result_path + "\\model_comparison.svg", format="svg", bbox_inces="tight")
# plt.savefig(result_path + "\\model_comparison.png", format="png", bbox_inces="tight")
plt.savefig(result_path + "\\model_comparison.svg", format="svg", bbox_inches="tight")
plt.savefig(result_path + "\\model_comparison.png", format="png", bbox_inches="tight")

plt.show()

@@ -368,8 +368,8 @@ def get_scores(df):
col_names = simulation_parameters + ["tp", "tn", "fp", "fn", "model"]

haber_fin = pd.DataFrame(columns=col_names)
clr_fin = pd.DataFrame(columns=col_names)
ks_fin = pd.DataFrame(columns=col_names)
clrt_fin = pd.DataFrame(columns=col_names)
t_fin = pd.DataFrame(columns=col_names)

l = len(results)
k = 1
@@ -385,11 +385,11 @@ def get_scores(df):
haber_df["fn"] = 0
haber_df["model"] = "Haber"

clr_df = haber_df.copy()
clr_df["model"] = "CLR"
clrt_df = haber_df.copy()
clrt_df["model"] = "CLR_ks"

ks_df = haber_df.copy()
ks_df["model"] = "KS"
t_df = haber_df.copy()
t_df["model"] = "KS"

for j in range(len(r.data)):
test_data = r.data[j]
@@ -402,39 +402,39 @@ def get_scores(df):
haber_df.loc[j, "tn"] = tn
haber_df.loc[j, "fn"] = fn

clr_mod = om.CLRModel(test_data)
clr_mod.fit_model()
tp, tn, fp, fn = clr_mod.eval_model(alpha, True)
clr_df.loc[j, "tp"] = tp
clr_df.loc[j, "fp"] = fp
clr_df.loc[j, "tn"] = tn
clr_df.loc[j, "fn"] = fn
clrks_mod = om.CLR_ttest(test_data)
clrks_mod.fit_model()
tp, tn, fp, fn = clrks_mod.eval_model(alpha, True)
clrt_df.loc[j, "tp"] = tp
clrt_df.loc[j, "fp"] = fp
clrt_df.loc[j, "tn"] = tn
clrt_df.loc[j, "fn"] = fn

ks_mod = om.KSTest(test_data)
ks_mod = om.TTest(test_data)
ks_mod.fit_model()
tp, tn, fp, fn = ks_mod.eval_model(alpha, True)
ks_df.loc[j, "tp"] = tp
ks_df.loc[j, "fp"] = fp
ks_df.loc[j, "tn"] = tn
ks_df.loc[j, "fn"] = fn
t_df.loc[j, "tp"] = tp
t_df.loc[j, "fp"] = fp
t_df.loc[j, "tn"] = tn
t_df.loc[j, "fn"] = fn

haber_fin = haber_fin.append(haber_df)
clr_fin = clr_fin.append(clr_df)
ks_fin = ks_fin.append(ks_df)
clrt_fin = clrt_fin.append(clrt_df)
t_fin = t_fin.append(t_df)

k += 1

print(haber_fin)
print(clr_fin)
print(ks_fin)
print(clrt_fin)
print(t_fin)

#%%

haber_fin_2 = get_scores(haber_fin)
clr_fin_2 = get_scores(clr_fin)
ks_fin_2 = get_scores(ks_fin)
clrt_fin_2 = get_scores(clrt_fin)
t_fin_2 = get_scores(t_fin)

fin_dfs = pd.concat([haber_fin_2, clr_fin_2, ks_fin_2])
fin_dfs = pd.concat([haber_fin_2, clrt_fin_2, t_fin_2])
print(fin_dfs)

#%%
@@ -472,14 +472,15 @@ def get_scores(df):
all_df = pd.concat([final_df, fin_dfs])

plot_df = all_df.rename(columns={"b_count": "Base", "num_increase": "Increase", "model": "Model",
"log_fold_increase": "log-fold increase"})
"log_fold_increase": "log-fold increase"}).loc[~all_df["model"].isin(["Poisson (Haber et al.)", "CLR_ks"])]

leg_labels = ["Simple DM", "SCDCdm", "scDC (Cao, Lin)", "Poisson regression", "T-test"]
#%%

# Plot for concept fig
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=plot_df, x="n_controls", y="mcc", hue="Model", palette="colorblind", ax=ax)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., labels=leg_labels)
ax.set(xlabel="Replicates per group", ylabel="MCC")

plt.savefig(result_path + "\\model_comparison_replicates_confint_extended.svg", format="svg", bbox_inches="tight")
@@ -491,10 +492,63 @@ def get_scores(df):

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=plot_df, x="log-fold increase", y="mcc", hue="Model", palette="colorblind", ax=ax)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., labels=leg_labels)
ax.set(xlabel="log-fold effect", ylabel="MCC")

plt.savefig(result_path + "\\model_comparison_logfold_confint_extended.svg", format="svg", bbox_inches="tight")
plt.savefig(result_path + "\\model_comparison_logfold_confint_extended.png", format="png", bbox_inches="tight")

plt.show()


#%%

# Revision of clr model
i = 150

test_data = results[i].data[8]

print(results[i].parameters)

#%%

importlib.reload(om)

clr_mod = om.CLRModel(test_data)
clr_mod.fit_model()
print(clr_mod.p_val)
res = clr_mod.eval_model(0.05, True)
print(res)

#%%

print(test_data.X)
p = np.prod(test_data.X, axis=1, keepdims=True)**(1/5)

clr = np.log(test_data.X/p)
print(clr)
print(np.sum(clr, axis=1))

#%%
from statsmodels.formula.api import glm
import statsmodels as sm

data_ct = pd.DataFrame({"x": test_data.obs["x_0"],
"y_0": clr[:, 0],
"y_1": clr[:, 1],
"y_2": clr[:, 2],
"y_3": clr[:, 3],
"y_4": clr[:, 4],})

model_ct = glm('y_0 ~ x', data=data_ct, family=sm.genmod.families.Poisson()).fit()

print(model_ct.summary())

#%%
importlib.reload(om)

clr_mod = om.CLR_ttest(test_data)
clr_mod.fit_model()
print(clr_mod.p_val)
res = clr_mod.eval_model(0.05, True)
print(res)
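
get_scores itself lies outside the hunks shown here; judging from the tp/tn/fp/fn columns it consumes and the mcc column the plots use, it aggregates confusion-matrix counts into summary metrics. A hedged sketch of that kind of aggregation (the column names match this script; the exact implementation is an assumption):

import numpy as np
import pandas as pd

def get_scores(df: pd.DataFrame) -> pd.DataFrame:
    """Derive sensitivity, precision and MCC from tp, tn, fp, fn counts."""
    df = df.copy()
    tp, tn = df["tp"].astype(float), df["tn"].astype(float)
    fp, fn = df["fp"].astype(float), df["fn"].astype(float)

    df["tpr"] = np.where(tp + fn > 0, tp / (tp + fn), 0)
    df["precision"] = np.where(tp + fp > 0, tp / (tp + fp), 0)
    denom = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    df["mcc"] = np.where(denom > 0, (tp * tn - fp * fn) / denom, 0)
    return df
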
1,712 changes: 1,712 additions & 0 deletions tests/microbiome_testing/MovingPictures_Analysis.ipynb

Large diffs are not rendered by default.

Empty file.
63 changes: 63 additions & 0 deletions tests/microbiome_testing/harrison_analysis.py
@@ -0,0 +1,63 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import importlib
import arviz as az
import pickle as pkl
from matplotlib.pyplot import cm
import matplotlib

from scdcdm.util import data_generation as gen
from scdcdm.util import comp_ana as mod
from scdcdm.util import result_classes as res
from scdcdm.util import multi_parameter_sampling as mult
from scdcdm.util import cell_composition_data as dat
from scdcdm.util import data_visualization as viz

pd.options.display.float_format = '{:10,.3f}'.format
pd.set_option('display.max_columns', 10)

#%%

data_path = "C:/Users/Johannes/Documents/PhD/data/rosen_mincount10_maxee2_trim200_results_forpaper/"

otu_file = "rosen_mincount10_maxee2_trim200.otu_table.99.denovo.rdp_assigned.paper_samples.txt"

with open(data_path + otu_file, "rb") as file:
otus = pd.read_csv(file, sep="\t", index_col=0)

print(otus)

#%%
meta_file = "patient_clinical_metadata.csv"

with open(data_path + meta_file, "rb") as file:
metadata = pd.read_csv(file, sep=",", index_col=0)

metadata = metadata[~metadata.index.str.endswith('F')]
metadata = metadata[~metadata.index.str.endswith('sick')]
metadata = metadata[~metadata.index.str.endswith('F2')]
metadata = metadata[~metadata.index.str.endswith('F2T')]
metadata = metadata[~metadata.index.str.endswith('2')]
metadata = metadata[~metadata.index.str.startswith('05')]

print(metadata)

#%%

meta_rel = metadata[(~pd.isna(metadata["mbs_consolidated"])) & (~pd.isna(metadata["bal"]))]

print(meta_rel)

#%%

print(meta_rel.groupby("mbs_consolidated").count())

#%%

otus_rel = otus.loc[otus.index.isin(meta_rel.index)]

print(otus_rel)
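
The script stops after subsetting the OTU table to samples that carry both mbs_consolidated and bal annotations. For illustration only (an assumption about the next step, not part of this commit), the analysis would presumably continue with the same from_pandas pattern used in moving_pictures_consistency_test.py below:

# Merge the relevant covariates onto the count table and build a SCDCdm data object
covariate_columns = ["mbs_consolidated", "bal"]
otus_meta = pd.merge(otus_rel, meta_rel[covariate_columns],
                     left_index=True, right_index=True)

data = dat.from_pandas(otus_meta, covariate_columns)
print(data.obs)

viz.boxplot_facets(data, feature="mbs_consolidated")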


@@ -103,13 +103,13 @@ def plot_feature_stackbars(data, features):


#%%
# No significances
# Model with subject as covariate
model_subject = mod.CompositionalAnalysis(data, "subject", baseline_index=None)

result_subject = model_subject.sample_hmc(num_results=int(20000), n_burnin=5000)

result_subject.summary_extended(hdi_prob=0.95)
# No significances

#%%
az.plot_trace(result_subject, var_names=["beta"])
212 changes: 212 additions & 0 deletions tests/microbiome_testing/moving_pictures_consistency_test.py
@@ -0,0 +1,212 @@
"""
Testing scdcdm for consistency on the "Moving pictures" dataset
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import importlib
import arviz as az
import pickle as pkl
from matplotlib.pyplot import cm
import matplotlib

from scdcdm.util import data_generation as gen
from scdcdm.util import comp_ana as mod
from scdcdm.util import result_classes as res
from scdcdm.util import multi_parameter_sampling as mult
from scdcdm.util import cell_composition_data as dat
from scdcdm.util import data_visualization as viz

pd.options.display.float_format = '{:10,.3f}'.format
pd.set_option('display.max_columns', None)

write_path = "C:/Users/Johannes/Documents/Uni/Master's_Thesis/compositionalDiff-johannes_tests_2/tests/microbiome_testing"

#%%

# read phylum-level data from biom file as tsv
data_path = "C:/Users/Johannes/AppData/Local/Packages/CanonicalGroupLimited.Ubuntu18.04onWindows_79rhkp1fndgsc/LocalState/rootfs/home/johannes/qiime2_projects/moving-pictures-tutorial"

with open(data_path+"/exported_data/feature-table-l2.tsv", "rb") as f:
biom_data = pd.read_csv(f, sep="\t", header=1, index_col="#OTU ID")

biom_data = biom_data.transpose()

# remove rare groups (<10 in all samples)

# read metadata
with open(data_path+"/sample-metadata.tsv", "rb") as f:
metadata = pd.read_csv(f, sep="\t", index_col="sample-id").iloc[1:, :]

metadata_columns = ["subject", "reported-antibiotic-usage", "days-since-experiment-start", "body-site"]

# add subject to count data
biom_data = pd.merge(biom_data, metadata[metadata_columns], left_index=True, right_index=True)

data = dat.from_pandas(biom_data, metadata_columns)
data.obs = data.obs.rename(columns={"reported-antibiotic-usage": "antibiotic", "body-site": "site",
"days-since-experiment-start": "days_since_start"})

print(data.obs)

#%%
importlib.reload(viz)

sns.set(style="ticks", font_scale=2)
args_swarmplot={"hue": "subject", "size": 10, "palette": "Reds"}
viz.boxplot_facets(data, feature="site")
plt.show()

#%%

# Model that differentiates both palms
model_palms = mod.CompositionalAnalysis(data[data.obs["site"].isin(["left palm", "right palm"])], "site", baseline_index=None)

result_palms = model_palms.sample_hmc(num_results=int(20000), n_burnin=5000)

result_palms.summary_extended(hdi_prob=0.95)

#%%

with az.rc_context(rc={'plot.max_subplots': None}):
az.plot_trace(result_palms, compact=True)
plt.show()

#%%

# fewer samples, shorter burn-in
result_palms_5000 = model_palms.sample_hmc(num_results=int(5000), n_burnin=1000)

result_palms_5000.summary_extended(hdi_prob=0.95)

#%%

with az.rc_context(rc={'plot.max_subplots': None}):
az.plot_trace(result_palms_5000, var_names="beta")
plt.show()



#%%

with az.rc_context(rc={'plot.max_subplots': None}):
az.plot_trace(result_palms, var_names=["beta", "b_raw"])
plt.show()

#%%

az.rhat(result_palms, method="folded")


#%%
# run palms comparison multiple times

results = []
n_chains = 50

model_palms = mod.CompositionalAnalysis(data[data.obs["site"].isin(["left palm", "right palm"])], "site", baseline_index=None)

for n in range(n_chains):
result_temp = model_palms.sample_hmc(num_results=int(20000), n_burnin=5000)

results.append(result_temp)

#%%
res_all = az.concat(results, dim="chain")

print(res_all.posterior)

#%%
az.to_netcdf(res_all, write_path + "/multi_chain_50_len20000_all")

#%%

acc_probs = pd.DataFrame(pd.concat([r.effect_df.loc[:, "Inclusion probability"] for r in results]))

acc_probs["chain_no"] = np.concatenate([np.repeat(i+1, 21) for i in range(n_chains)])

acc_probs.index = acc_probs.index.droplevel(0)

acc_probs = acc_probs.reset_index()

print(acc_probs)

#%%
with open(write_path + "/multi_chain_50_len20000_acc", "wb") as file:
pkl.dump(acc_probs, file)


#%%
with open(write_path + "/multi_chain_50_len20000_acc", "rb") as file:
acc_probs = pkl.load(file)

res_all = az.from_netcdf(write_path + "/multi_chain_50_len20000_all")

#%%
coords = {"cell_type": "k__Bacteria;p__Proteobacteria"}
az.plot_trace(res_all, var_names="beta", coords=coords)
plt.show()


#%%
sns.set(style="ticks", font_scale=1)

n_chains = 50
col = [cm.tab20(i % 20) for i in range(n_chains)]

g = sns.FacetGrid(data=acc_probs.loc[acc_probs["Cell Type"].isin(["k__Bacteria;p__Fusobacteria", "k__Bacteria;p__Firmicutes", "k__Bacteria;p__Tenericutes"])], col="Cell Type", col_wrap=3)
g.map(sns.kdeplot, "Inclusion probability")
rug = g.map(sns.rugplot, "Inclusion probability", height=0.3, color="black")
for ax in g.axes:
ax.axvline(0.81, color="red", linewidth=0.5)

# There are no legend labels by default, so define them manually
legend_labels = [i+1 for i in range(n_chains)]

# Create the legend patches
legend_patches = [matplotlib.patches.Patch(color=C, label=L) for
C, L in zip(col, legend_labels)]

# Plot the legend
# plt.legend(handles=legend_patches, loc="lower right")

plt.show()


#%%
# Only inconsistencies for Firmicutes

acc_probs["is_significant"] = np.where(acc_probs["Inclusion probability"] > 0.81, True, False)

sig_table = acc_probs.groupby(["Cell Type"]).agg({"Inclusion probability": "mean", "is_significant": "sum"})

print(sig_table)

#%%

types = ["k__Bacteria;p__Proteobacteria", "k__Bacteria;p__FBP"]

ax = az.plot_pair(
res_all,
kind=["scatter", "kde"],
kde_kwargs={"fill_last": False},
marginals=True,
coords={"chain": [33], "cell_type": types, "cell_type_nb": types},
point_estimate="median",
)

plt.show()

#%%

az.rhat(res_all)

#%%

az.summary(res_all)

#%%

