forked from BostonGene/MFP
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signature calculation; scaling; clustering functions; Example script
- Loading branch information
Showing
5 changed files
with
294 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# Python3.7 | ||
|
||
from clustering.clustering import * | ||
from clustering.utils import * | ||
|
||
# Example script | ||
|
||
# Read signatures | ||
gmt = read_gene_sets('signatures.gmt') # GMT format like in MSIGdb | ||
|
||
# Read expressions | ||
exp = pd.read_csv('expression.tsv', sep='\t', index_col=0) # log2+1 transformed; Genes in columns | ||
|
||
# Calc signature scores | ||
signature_scores = ssgsea_formula(exp, gmt) | ||
|
||
# Scale signatures | ||
signature_scores_scaled = median_scale(signature_scores) | ||
|
||
# Check the clustering within a range of 30 to 65% similarity. | ||
# >65% - usually graph is not connected; <30% - unreasonable correlation | ||
clustering_metrics = clustering_profile_metrics(signature_scores_scaled, threshold_mm=(.3, .65), step=.01) | ||
|
||
# Visualize the partitions | ||
clustering_profile_metrics_plot(clustering_metrics) | ||
|
||
# Then select the best threshold using one ore more metrics. | ||
best_threshold = '0.51' | ||
|
||
|
||
def detect_type(ser, scores): | ||
cmeans = pd.DataFrame({cg: scores.loc[samps.index].mean() for cg, samps in ser.groupby(ser)}) | ||
mapper = {} | ||
deltas = (cmeans.loc[['Angiogenesis', 'Endothelium', 'CAF', 'Matrix', 'Matrix_remodeling']].mean() - | ||
cmeans.loc[['MHCII', 'Antitumor_cytokines', 'Coactivation_molecules', | ||
'B_cells', 'NK_cells', 'Checkpoint_inhibition', | ||
'Effector_cells', 'T_cells', 'Th1_signature', | ||
'T_cell_traffic', 'MHCI']].mean()).sort_values() | ||
|
||
mapper[deltas.index[-1]] = 'F' # That's fibrotic | ||
mapper[deltas.index[0]] = 'IE' # Immune enriched, non-fibrotic | ||
cmeans.pop(deltas.index[-1]) | ||
cmeans.pop(deltas.index[0]) | ||
|
||
deltas = (cmeans.loc[['Angiogenesis', 'Endothelium', 'CAF', 'Matrix', 'Matrix_remodeling', | ||
'Protumor_cytokines', 'Neutrophil_signature', 'Granulocyte_traffic', | ||
'Macrophages', 'Macrophage_DC_traffic', 'MDSC_traffic', 'MDSC', | ||
'Th2_signature', 'T_reg_traffic', 'Treg', 'M1_signatures', 'MHCII', | ||
'Antitumor_cytokines', 'Coactivation_molecules', 'B_cells', 'NK_cells', | ||
'Checkpoint_inhibition', 'Effector_cells', 'T_cells', 'Th1_signature', | ||
'T_cell_traffic', 'MHCI', 'EMT_signature']].mean() - | ||
cmeans.loc['Proliferation_rate']).sort_values() | ||
|
||
mapper[deltas.index[-1]] = 'IE/F' # Immune enriched & fibrotic | ||
mapper[deltas.index[0]] = 'D' # Desert | ||
return ser.map(mapper).rename('MFP') | ||
|
||
|
||
# Detect cluster types | ||
final_clusters = detect_type(clustering_metrics[best_threshold], signature_scores_scaled) | ||
|
||
# Output the clusters | ||
final_clusters.to_csv('final_clusters.tsv', sep='\t', index=True) |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
import warnings | ||
|
||
import community # louvain | ||
import matplotlib.pyplot as plt | ||
import networkx as nx | ||
import numpy as np | ||
import pandas as pd | ||
from tqdm import tqdm | ||
|
||
|
||
def gen_graph(similarity_matrix, threshold=0.8): | ||
""" | ||
Generates a graph from the similarity_matrix (square dataframe). Each sample is a node, similarity - edge weight. | ||
Edges with weight lower than the threshold are ignored. | ||
Only nodes with at least 1 edge with weight above the threshold will be present in the final graph | ||
:param similarity_matrix: | ||
:param threshold: | ||
:return: | ||
""" | ||
G = nx.Graph() | ||
for col_n in similarity_matrix: | ||
col = similarity_matrix[col_n].drop(col_n) | ||
mtr = col[col > threshold] | ||
for row_n, val in list(mtr.to_dict().items()): | ||
G.add_edge(col_n, row_n, weight=round(val, 2)) | ||
return G | ||
|
||
|
||
def louvain_community(correlation_matrix, threshold=1.4, resolution=1, random_state=100, **kwargs): | ||
""" | ||
Generates a graph from a correlation_matrix with weighted edges (weight<threshold excluded). | ||
Then detects communities using louvain algorithm (https://github.com/taynaud/python-louvain) | ||
:param correlation_matrix: | ||
:param threshold: | ||
:param resolution: | ||
:param random_state: | ||
:param kwargs: | ||
:return: | ||
""" | ||
G = gen_graph(correlation_matrix, threshold) | ||
if len(G.edges()) > 10000000: | ||
warnings.warn("Too many edges will result in huge computational time") | ||
return pd.Series(community.best_partition(G, resolution=resolution, | ||
random_state=random_state, **kwargs)) + 1 | ||
|
||
|
||
def dense_clustering(data, threshold=0.4, name='MFP', method='louvain', **kwargs): | ||
""" | ||
Generates a graph from the table features(cols)*samples(rows). | ||
Then performs community detection using a selected method leiden|louvain | ||
:param method: | ||
:param data: | ||
:param threshold: | ||
:param name: | ||
:return: | ||
""" | ||
if method == 'louvain': # others could be implemented like Leiden | ||
partition = louvain_community(data.T.corr() + 1, threshold + 1, **kwargs) | ||
else: | ||
raise Exception('Unknown method') | ||
|
||
return partition.rename(name) | ||
|
||
|
||
def clustering_profile_metrics(data, threshold_mm=(0.3, 0.65), step=0.01, method='louvain'): | ||
""" | ||
Iterates threshold in threshold_mm area with step. Calculates cluster separation metrics on each threshold. | ||
Returns a pd.DataFrame with the metrics | ||
:param data: | ||
:param threshold_mm: | ||
:param step: | ||
:param method: | ||
:return: | ||
""" | ||
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score | ||
cluster_metrics = {} | ||
|
||
for tr in tqdm(np.round(np.arange(threshold_mm[0], threshold_mm[1], step), 3)): | ||
clusters_comb = dense_clustering(data, threshold=tr, method=method) | ||
cluster_metrics[tr] = { | ||
'ch': calinski_harabasz_score(data.loc[clusters_comb.index], clusters_comb), | ||
'db': davies_bouldin_score(data.loc[clusters_comb.index], clusters_comb), | ||
'sc': silhouette_score(data.loc[clusters_comb.index], clusters_comb), | ||
'N': len(clusters_comb.unique()), | ||
'perc': clusters_comb, | ||
} | ||
|
||
return pd.DataFrame(cluster_metrics).T | ||
|
||
|
||
def clustering_profile_metrics_plot(cluster_metrics, num_clusters_ylim_max=7): | ||
""" | ||
Plots a dataframe from clustering_profile_metrics | ||
:param cluster_metrics: | ||
:param num_clusters_ylim_max: | ||
:return: axis array | ||
""" | ||
# necessary for correct x axis sharing | ||
cluster_metrics.index = [str(x) for x in cluster_metrics.index] | ||
|
||
plots_ratios = [3, 3, 3, 1, 2] | ||
fig, axs = plt.subplots(len(plots_ratios), 1, figsize=(8, np.sum(plots_ratios)), | ||
gridspec_kw={'height_ratios': plots_ratios}, sharex=True) | ||
for ax in axs: | ||
ax.tick_params(axis='x', which='minor', length=0) | ||
af = axs.flat | ||
|
||
ax = cluster_metrics.db.plot(ax=next(af), label='Davies Bouldin', color='#E63D06') | ||
ax.legend() | ||
|
||
ax = cluster_metrics.ch.plot(ax=next(af), label='Calinski Harabasz', color='#E63D06') | ||
ax.legend() | ||
|
||
ax = cluster_metrics.sc.plot(ax=next(af), label='Silhouette score', color='#E63D06') | ||
ax.legend() | ||
|
||
ax = cluster_metrics.N.plot(kind='line', ax=next(af), label='# clusters', color='#000000') | ||
ax.set_ylim(0, num_clusters_ylim_max) | ||
ax.legend() | ||
|
||
# display percentage for 10 clusters max | ||
clusters_perc = pd.DataFrame([x.value_counts() for x in cluster_metrics.perc], | ||
index=cluster_metrics.index).iloc[:, :10] | ||
|
||
clusters_perc.plot(kind='bar', stached=True, ax=next(af), offset=.5) | ||
|
||
ax.set_xticks(ax.get_xticks() - .5) | ||
ax.set_xticklabels(ax.get_xticklabels(), rotation=90) | ||
|
||
ax.set_ylabel('Cluster %') | ||
return ax | ||
|
||
|
||
def clustering_select_best_tr(data, n_clusters=4, threshold_mm=(0.3, 0.6), | ||
step=0.025, method='leiden', num_clusters_ylim_max=7, plot=True): | ||
""" | ||
Selects the best threshold for n_clusters separation using dense_clustering with selected method | ||
from threshold_mm with a particular step | ||
:param data: dataframe with processes (rows - samples, columns - signatures) | ||
:param n_clusters: desired number of clusters | ||
:param threshold_mm: range of thresholds | ||
:param step: step to go through range of thresholds | ||
:param method: clustering method | ||
:param num_clusters_ylim_max: set y_lim for plot with number of clusters | ||
:param plot: whether to plot all matrix | ||
:return: the threshold to get n_clusters | ||
""" | ||
cl_scs = clustering_profile_metrics(data, threshold_mm=threshold_mm, step=step, method=method) | ||
|
||
if plot: | ||
clustering_profile_metrics_plot(cl_scs, num_clusters_ylim_max) | ||
plt.show() | ||
|
||
cl_scs_filtered = cl_scs[cl_scs.N == n_clusters] | ||
|
||
if not len(cl_scs_filtered): | ||
raise Exception('No partition with n_clusters = {}'.format(n_clusters)) | ||
|
||
cl_scs_filtered.sc += 1 - cl_scs_filtered.sc.min() | ||
return (cl_scs_filtered.ch / cl_scs_filtered.db / cl_scs_filtered.sc).sort_values().index[-1] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
import pandas as pd | ||
|
||
|
||
class GeneSet(object): | ||
def __init__(self, name, descr, genes): | ||
self.name = name | ||
self.descr = descr | ||
self.genes = set(genes) | ||
self.genes_ordered = list(genes) | ||
|
||
def __str__(self): | ||
return '{}\t{}\t{}'.format(self.name, self.descr, '\t'.join(self.genes)) | ||
|
||
|
||
def read_gene_sets(gmt_file): | ||
""" | ||
Return dict {geneset_name : GeneSet object} | ||
:param gmt_file: str, path to .gmt file | ||
:return: dict | ||
""" | ||
gene_sets = {} | ||
with open(gmt_file) as handle: | ||
for line in handle: | ||
items = line.strip().split('\t') | ||
name = items[0].strip() | ||
description = items[1].strip() | ||
genes = set([gene.strip() for gene in items[2:]]) | ||
gene_sets[name] = GeneSet(name, description, genes) | ||
|
||
return gene_sets | ||
|
||
|
||
def ssgsea_score(ranks, genes): | ||
common_genes = list(set(genes).intersection(set(ranks.index))) | ||
if not len(common_genes): | ||
return pd.Series([0] * len(ranks.columns), index=ranks.columns) | ||
sranks = ranks.loc[common_genes] | ||
return (sranks ** 1.25).sum() / (sranks ** 0.25).sum() - (len(ranks.index) - len(common_genes) + 1) / 2 | ||
|
||
|
||
def ssgsea_formula(data, gene_sets, rank_method='max'): | ||
""" | ||
Return DataFrame with ssgsea scores | ||
Only overlapping genes will be analyzed | ||
:param data: pd.DataFrame, DataFrame with samples in columns and variables in rows | ||
:param gene_sets: dict, keys - processes, values - bioreactor.gsea.GeneSet | ||
:param rank_method: str, 'min' or 'max'. | ||
:return: pd.DataFrame, ssgsea scores, index - genesets, columns - patients | ||
""" | ||
|
||
ranks = data.T.rank(method=rank_method, na_option='bottom') | ||
|
||
return pd.DataFrame({gs_name: ssgsea_score(ranks, gene_sets[gs_name].genes) | ||
for gs_name in list(gene_sets.keys())}) | ||
|
||
|
||
def median_scale(data, clip=None): | ||
c_data = (data - data.median()) / data.mad() | ||
if clip is not None: | ||
return c_data.clip(-clip, clip) | ||
return c_data |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
scipy>=1.1.0 | ||
numpy>=1.15.4 | ||
matplotlib>=2.2.0 | ||
pandas>=0.23.4 | ||
scikit-learn>=0.20.4 | ||
seaborn>=0.8.1 | ||
networkx==2.0 | ||
python-louvain==0.12 |