forked from JinmiaoChenLab/SEDR_analyses
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDLPFC_stLearn.py
executable file
·106 lines (75 loc) · 4.03 KB
/
DLPFC_stLearn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import pandas as pd
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score, \
homogeneity_completeness_v_measure
from sklearn.metrics.cluster import contingency_matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np
import scanpy as sc
import stlearn as st
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
import sys
import matplotlib.pyplot as plt
# --- Paths and sample selection ---
# Sample id (e.g. a DLPFC slice name) is taken from the command line.
BASE_PATH = Path('../data/DLPFC')
sample = sys.argv[1]
# Image tiles for feature extraction are written to a per-sample tmp dir.
TILE_PATH = Path("/tmp/{}_tiles".format(sample))
TILE_PATH.mkdir(parents=True, exist_ok=True)
OUTPUT_PATH = Path(f"../output/DLPFC/{sample}/stLearn")
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
# --- Load 10X Visium data and ground-truth layer annotations ---
data = st.Read10X(BASE_PATH / sample)
ground_truth_df = pd.read_csv( BASE_PATH / sample / 'metadata.tsv', sep='\t')
ground_truth_df['ground_truth'] = ground_truth_df['layer_guess']
# Encode the string layer labels as integers for the metric functions.
le = LabelEncoder()
ground_truth_le = le.fit_transform(list(ground_truth_df["ground_truth"].values))
# The -1 presumably discounts one encoded value (e.g. NaN/unlabeled spots
# in layer_guess) — TODO(review): confirm against the metadata contents.
n_cluster = len((set(ground_truth_df["ground_truth"]))) - 1
# NOTE(review): this assignment relies on ground_truth_df sharing the same
# row order/index as data.obs — verify the metadata rows match the spots.
data.obs['ground_truth'] = ground_truth_df["ground_truth"]
ground_truth_df["ground_truth_le"] = ground_truth_le
# pre-processing for gene count table
st.pp.filter_genes(data,min_cells=1)
st.pp.normalize_total(data)
st.pp.log1p(data)
# run PCA for gene expression data
st.em.run_pca(data,n_comps=15)
# pre-processing for spot image
st.pp.tiling(data, TILE_PATH)
# this step uses deep learning model to extract high-level features from tile images
# may need few minutes to be completed
st.pp.extract_feature(data)
def calculate_clustering_matrix(pred, gt, sample, methods_):
    """Collect clustering-quality metrics comparing ``pred`` against ``gt``.

    Parameters
    ----------
    pred : array-like
        Predicted cluster label for each spot.
    gt : array-like
        Label-encoded ground-truth annotation for each spot.
    sample : str
        Sample identifier, recorded in the 'Sample' column.
    methods_ : str
        Method name, recorded in the 'Method' column.

    Returns
    -------
    pandas.DataFrame
        One row per metric with columns
        ['Sample', 'Score', 'PCA_or_UMAP', 'Method', 'test'].
    """
    # sklearn's argument order is (labels_true, labels_pred). ARI and NMI are
    # symmetric, but homogeneity/completeness and purity are NOT, so the
    # ground truth must be passed first for those.
    homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(gt, pred)
    scores = [
        ("Adjusted_Rand_Score", adjusted_rand_score(pred, gt)),
        ("Normalized_Mutual_Info_Score", normalized_mutual_info_score(pred, gt)),
        ("Purity_Score", purity_score(gt, pred)),
        ("Homogeneity_Score", homogeneity),
        ("Completeness_Score", completeness),
        ("V_Measure_Score", v_measure),
    ]
    # DataFrame.append was removed in pandas 2.0 — build all rows in one shot.
    return pd.DataFrame(
        [[sample, score, "pca", methods_, test_name] for test_name, score in scores],
        columns=['Sample', 'Score', 'PCA_or_UMAP', 'Method', "test"],
    )
def purity_score(y_true, y_pred):
    """Return the purity of a clustering against reference labels.

    Each predicted cluster is credited with its most common true label;
    purity is the fraction of all samples covered by those majorities.
    """
    # Rows = true classes, columns = predicted clusters (a confusion matrix).
    counts = contingency_matrix(y_true, y_pred)
    # For every predicted cluster take its dominant class count, then
    # normalize by the total number of samples.
    return counts.max(axis=0).sum() / counts.sum()
# --- stSME clustering and result export ---
# SME-normalize the raw expression (presumably smoothing each spot using its
# physical-distance neighbours — confirm against stLearn docs); the adjusted
# matrix is placed in data.obsm['raw_SME_normalized'].
st.spatial.SME.SME_normalize(data, use_data="raw", weights="physical_distance")
data_ = data.copy()
# Swap the working expression matrix for the SME-normalized one before
# scaling and PCA.
data_.X = data_.obsm['raw_SME_normalized']
st.pp.scale(data_)
st.em.run_pca(data_,n_comps=30)
# k-means on the 30 PCs; assignments land in data_.obs["X_pca_kmeans"].
st.tl.clustering.kmeans(data_, n_clusters=n_cluster, use_data="X_pca", key_added="X_pca_kmeans")
st.pl.cluster_plot(data_, use_label="X_pca_kmeans")
methods_ = "stSME_disk"
# NOTE(review): results_df is computed but never written anywhere below —
# confirm whether the metrics table should also be saved under OUTPUT_PATH.
results_df = calculate_clustering_matrix(data_.obs["X_pca_kmeans"], ground_truth_le, sample, methods_)
# Saves the current matplotlib figure, i.e. the cluster plot drawn above.
plt.savefig(OUTPUT_PATH / 'cluster.png')
# Persist per-spot metadata (including cluster labels) and the PCs.
data_.obs.to_csv(OUTPUT_PATH / 'metadata.tsv', sep='\t', index=False)
df_PCA = pd.DataFrame(data = data_.obsm['X_pca'], index = data_.obs.index)
df_PCA.to_csv(OUTPUT_PATH / 'PCs.tsv', sep='\t')