Signature calculation; scaling; clustering functions; Example script
avkitex committed Sep 10, 2020
1 parent 9779305 commit 37564d4
Showing 5 changed files with 294 additions and 0 deletions.
63 changes: 63 additions & 0 deletions clustering.py
@@ -0,0 +1,63 @@
# Python3.7

import pandas as pd

from clustering.clustering import *
from clustering.utils import *

# Example script

# Read signatures
gmt = read_gene_sets('signatures.gmt')  # GMT format, as in MSigDB
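# A GMT line is tab-separated: <set name> <description> <gene 1> <gene 2> ...
# e.g. (illustrative only): Angiogenesis  na  VEGFA  PDGFB  CXCL8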

# Read expressions
exp = pd.read_csv('expression.tsv', sep='\t', index_col=0)  # log2(x+1)-transformed; samples in rows, genes in columns

# Calc signature scores
signature_scores = ssgsea_formula(exp, gmt)
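# signature_scores: pd.DataFrame, rows - samples, columns - signatures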

# Scale signatures
signature_scores_scaled = median_scale(signature_scores)

# Scan clustering thresholds (Pearson correlation cutoffs) from 0.3 to 0.65.
# Above ~0.65 the graph is usually disconnected; below ~0.3 correlations are too weak to be meaningful
clustering_metrics = clustering_profile_metrics(signature_scores_scaled, threshold_mm=(.3, .65), step=.01)
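# clustering_metrics: pd.DataFrame, one row per threshold; columns are the
# Calinski-Harabasz ('ch'), Davies-Bouldin ('db') and silhouette ('sc') scores,
# the number of clusters ('N') and the partition itself ('perc')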

# Visualize the partitions
clustering_profile_metrics_plot(clustering_metrics)

# Then select the best threshold using one or more of the metrics above.
# clustering_profile_metrics_plot converts the index to strings, so the threshold is a string here
best_threshold = '0.51'


def detect_type(ser, scores):
    """
    Assigns one of the four TME subtype labels (F, IE, IE/F, D) to each cluster
    in ser, based on the mean signature scores of the cluster's samples.
    :param ser: pd.Series, sample -> cluster id
    :param scores: pd.DataFrame, rows - samples, columns - signatures (scaled)
    :return: pd.Series, sample -> subtype label, named 'MFP'
    """
    # mean signature score per cluster: rows - signatures, columns - cluster ids
    cmeans = pd.DataFrame({cg: scores.loc[samps.index].mean() for cg, samps in ser.groupby(ser)})
    mapper = {}

    # stromal/fibrotic signatures minus immune signatures, per cluster
    deltas = (cmeans.loc[['Angiogenesis', 'Endothelium', 'CAF', 'Matrix', 'Matrix_remodeling']].mean() -
              cmeans.loc[['MHCII', 'Antitumor_cytokines', 'Coactivation_molecules',
                          'B_cells', 'NK_cells', 'Checkpoint_inhibition',
                          'Effector_cells', 'T_cells', 'Th1_signature',
                          'T_cell_traffic', 'MHCI']].mean()).sort_values()

    mapper[deltas.index[-1]] = 'F'   # Fibrotic
    mapper[deltas.index[0]] = 'IE'   # Immune-enriched, non-fibrotic
    cmeans.pop(deltas.index[-1])
    cmeans.pop(deltas.index[0])

    # for the two remaining clusters: overall TME activity minus proliferation
    deltas = (cmeans.loc[['Angiogenesis', 'Endothelium', 'CAF', 'Matrix', 'Matrix_remodeling',
                          'Protumor_cytokines', 'Neutrophil_signature', 'Granulocyte_traffic',
                          'Macrophages', 'Macrophage_DC_traffic', 'MDSC_traffic', 'MDSC',
                          'Th2_signature', 'T_reg_traffic', 'Treg', 'M1_signatures', 'MHCII',
                          'Antitumor_cytokines', 'Coactivation_molecules', 'B_cells', 'NK_cells',
                          'Checkpoint_inhibition', 'Effector_cells', 'T_cells', 'Th1_signature',
                          'T_cell_traffic', 'MHCI', 'EMT_signature']].mean() -
              cmeans.loc['Proliferation_rate']).sort_values()

    mapper[deltas.index[-1]] = 'IE/F'  # Immune-enriched & fibrotic
    mapper[deltas.index[0]] = 'D'      # Desert
    return ser.map(mapper).rename('MFP')


# Detect cluster types at the selected threshold (the partition is in the 'perc' column)
final_clusters = detect_type(clustering_metrics.loc[best_threshold, 'perc'], signature_scores_scaled)
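# final_clusters: pd.Series named 'MFP', mapping each sample to one of F / IE / IE/F / D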

# Output the clusters
final_clusters.to_csv('final_clusters.tsv', sep='\t', index=True)
Empty file added clustering/__init__.py
Empty file.
160 changes: 160 additions & 0 deletions clustering/clustering.py
@@ -0,0 +1,160 @@
import warnings

import community # louvain
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm


def gen_graph(similarity_matrix, threshold=0.8):
"""
Generates a graph from the similarity_matrix (square dataframe). Each sample is a node, similarity - edge weight.
Edges with weight lower than the threshold are ignored.
Only nodes with at least 1 edge with weight above the threshold will be present in the final graph
:param similarity_matrix:
:param threshold:
:return:
"""
G = nx.Graph()
for col_n in similarity_matrix:
col = similarity_matrix[col_n].drop(col_n)
mtr = col[col > threshold]
for row_n, val in list(mtr.to_dict().items()):
G.add_edge(col_n, row_n, weight=round(val, 2))
return G


def louvain_community(correlation_matrix, threshold=1.4, resolution=1, random_state=100, **kwargs):
"""
Generates a graph from a correlation_matrix with weighted edges (weight<threshold excluded).
Then detects communities using louvain algorithm (https://github.com/taynaud/python-louvain)
:param correlation_matrix:
:param threshold:
:param resolution:
:param random_state:
:param kwargs:
:return:
"""
G = gen_graph(correlation_matrix, threshold)
if len(G.edges()) > 10000000:
warnings.warn("Too many edges will result in huge computational time")
return pd.Series(community.best_partition(G, resolution=resolution,
random_state=random_state, **kwargs)) + 1


def dense_clustering(data, threshold=0.4, name='MFP', method='louvain', **kwargs):
"""
Generates a graph from the table features(cols)*samples(rows).
Then performs community detection using a selected method leiden|louvain
:param method:
:param data:
:param threshold:
:param name:
:return:
"""
    if method == 'louvain':  # others, e.g. Leiden, could be implemented
        # correlations are shifted by +1 (into [0, 2]) so all edge weights are non-negative
        partition = louvain_community(data.T.corr() + 1, threshold + 1, **kwargs)
else:
raise Exception('Unknown method')

return partition.rename(name)
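
# Possible usage (a sketch; signature_scores_scaled as in the example script):
#   clusters = dense_clustering(signature_scores_scaled, threshold=0.51)
# returns a pd.Series named 'MFP' mapping each sample to a 1-based cluster id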


def clustering_profile_metrics(data, threshold_mm=(0.3, 0.65), step=0.01, method='louvain'):
"""
Iterates threshold in threshold_mm area with step. Calculates cluster separation metrics on each threshold.
Returns a pd.DataFrame with the metrics
:param data:
:param threshold_mm:
:param step:
:param method:
:return:
"""
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
cluster_metrics = {}

for tr in tqdm(np.round(np.arange(threshold_mm[0], threshold_mm[1], step), 3)):
clusters_comb = dense_clustering(data, threshold=tr, method=method)
cluster_metrics[tr] = {
'ch': calinski_harabasz_score(data.loc[clusters_comb.index], clusters_comb),
'db': davies_bouldin_score(data.loc[clusters_comb.index], clusters_comb),
'sc': silhouette_score(data.loc[clusters_comb.index], clusters_comb),
'N': len(clusters_comb.unique()),
'perc': clusters_comb,
}

return pd.DataFrame(cluster_metrics).T


def clustering_profile_metrics_plot(cluster_metrics, num_clusters_ylim_max=7):
"""
Plots a dataframe from clustering_profile_metrics
:param cluster_metrics:
:param num_clusters_ylim_max:
:return: axis array
"""
# necessary for correct x axis sharing
cluster_metrics.index = [str(x) for x in cluster_metrics.index]

plots_ratios = [3, 3, 3, 1, 2]
fig, axs = plt.subplots(len(plots_ratios), 1, figsize=(8, np.sum(plots_ratios)),
gridspec_kw={'height_ratios': plots_ratios}, sharex=True)
for ax in axs:
ax.tick_params(axis='x', which='minor', length=0)
af = axs.flat

ax = cluster_metrics.db.plot(ax=next(af), label='Davies Bouldin', color='#E63D06')
ax.legend()

ax = cluster_metrics.ch.plot(ax=next(af), label='Calinski Harabasz', color='#E63D06')
ax.legend()

ax = cluster_metrics.sc.plot(ax=next(af), label='Silhouette score', color='#E63D06')
ax.legend()

ax = cluster_metrics.N.plot(kind='line', ax=next(af), label='# clusters', color='#000000')
ax.set_ylim(0, num_clusters_ylim_max)
ax.legend()

    # display cluster sizes for at most 10 clusters
    clusters_perc = pd.DataFrame([x.value_counts() for x in cluster_metrics.perc],
                                 index=cluster_metrics.index).iloc[:, :10]

    ax = clusters_perc.plot(kind='bar', stacked=True, ax=next(af))

    # shift the ticks so the bars line up with the line plots above
    ax.set_xticks(ax.get_xticks() - .5)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

ax.set_ylabel('Cluster %')
return ax


def clustering_select_best_tr(data, n_clusters=4, threshold_mm=(0.3, 0.6),
                              step=0.025, method='louvain', num_clusters_ylim_max=7, plot=True):
    """
    Selects the best threshold yielding n_clusters, using dense_clustering with the selected
    method over the threshold_mm range with the given step. The default method is 'louvain',
    the only one dense_clustering currently implements.
    :param data: dataframe with processes (rows - samples, columns - signatures)
    :param n_clusters: desired number of clusters
    :param threshold_mm: range of thresholds
    :param step: step to go through the range of thresholds
    :param method: clustering method
    :param num_clusters_ylim_max: y-axis limit for the number-of-clusters plot
    :param plot: whether to plot the metrics profile
    :return: the threshold that yields n_clusters
    """
cl_scs = clustering_profile_metrics(data, threshold_mm=threshold_mm, step=step, method=method)

if plot:
clustering_profile_metrics_plot(cl_scs, num_clusters_ylim_max)
plt.show()

cl_scs_filtered = cl_scs[cl_scs.N == n_clusters]

if not len(cl_scs_filtered):
raise Exception('No partition with n_clusters = {}'.format(n_clusters))

    # shift silhouette scores to be positive so the ratio below cannot flip sign
    cl_scs_filtered = cl_scs_filtered.copy()
    cl_scs_filtered.sc += 1 - cl_scs_filtered.sc.min()
    return (cl_scs_filtered.ch / cl_scs_filtered.db / cl_scs_filtered.sc).sort_values().index[-1]
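
# Possible usage (a sketch; assumes the scaled signature scores from the example script;
# the returned threshold is a string when plot=True, since the plotting function
# converts the index to strings):
#   best_tr = clustering_select_best_tr(signature_scores_scaled, n_clusters=4)
#   clusters = dense_clustering(signature_scores_scaled, threshold=float(best_tr))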
63 changes: 63 additions & 0 deletions clustering/utils.py
@@ -0,0 +1,63 @@
import pandas as pd


class GeneSet(object):
def __init__(self, name, descr, genes):
self.name = name
self.descr = descr
self.genes = set(genes)
self.genes_ordered = list(genes)

def __str__(self):
return '{}\t{}\t{}'.format(self.name, self.descr, '\t'.join(self.genes))


def read_gene_sets(gmt_file):
"""
Return dict {geneset_name : GeneSet object}
:param gmt_file: str, path to .gmt file
:return: dict
"""
gene_sets = {}
with open(gmt_file) as handle:
for line in handle:
items = line.strip().split('\t')
name = items[0].strip()
description = items[1].strip()
            # keep file order; GeneSet stores both the set and the ordered list
            genes = [gene.strip() for gene in items[2:]]
            gene_sets[name] = GeneSet(name, description, genes)

return gene_sets


def ssgsea_score(ranks, genes):
common_genes = list(set(genes).intersection(set(ranks.index)))
if not len(common_genes):
return pd.Series([0] * len(ranks.columns), index=ranks.columns)
sranks = ranks.loc[common_genes]
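    # Roughly: a rank-weighted mean of the set's genes (the 1.25/0.25 powers emphasize
    # top-ranked genes) minus an expected-rank offset (N - k + 1) / 2, where N is the
    # number of ranked genes and k the number of set genes found among them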
return (sranks ** 1.25).sum() / (sranks ** 0.25).sum() - (len(ranks.index) - len(common_genes) + 1) / 2


def ssgsea_formula(data, gene_sets, rank_method='max'):
"""
Return DataFrame with ssgsea scores
Only overlapping genes will be analyzed
:param data: pd.DataFrame, DataFrame with samples in columns and variables in rows
:param gene_sets: dict, keys - processes, values - bioreactor.gsea.GeneSet
:param rank_method: str, 'min' or 'max'.
:return: pd.DataFrame, ssgsea scores, index - genesets, columns - patients
"""

ranks = data.T.rank(method=rank_method, na_option='bottom')

return pd.DataFrame({gs_name: ssgsea_score(ranks, gene_sets[gs_name].genes)
for gs_name in list(gene_sets.keys())})


def median_scale(data, clip=None):
    """
    Median-center each column and scale by its mean absolute deviation (pandas .mad());
    optionally clip the result to [-clip, clip].
    """
    c_data = (data - data.median()) / data.mad()
    if clip is not None:
        return c_data.clip(-clip, clip)
    return c_data
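
# e.g. median_scale(signature_scores, clip=3) centers each signature at 0 and
# caps the scaled values at +/-3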
8 changes: 8 additions & 0 deletions requirements.txt
@@ -0,0 +1,8 @@
scipy>=1.1.0
numpy>=1.15.4
matplotlib>=2.2.0
pandas>=0.23.4
scikit-learn>=0.20.4
seaborn>=0.8.1
networkx==2.0
python-louvain==0.12
tqdm  # imported by clustering/clustering.py
