-
Notifications
You must be signed in to change notification settings - Fork 12
/
utils.py
138 lines (109 loc) · 4.6 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import pandas as pd
import numpy as np
import os
import re
from itertools import chain
from collections import OrderedDict
import warnings
def load_from_csvs(csv_files, sample_names=None, min_cell_count=10):
    """Read in scRNA-seq data from csv files and assemble one count matrix.

    :param csv_files: List of csv files, each a cells x genes count matrix
        with cell names in the first column and gene names in the header
    :param sample_names: Prefix to attach to the cell names; defaults to
        each file's basename with its '.csv' suffix removed
    :param min_cell_count: Minimum number of cells a gene must be detected
        in (count > 0) to be retained
    :return: Pandas data frame representing the combined count matrix
    """
    if sample_names is None:
        # Escape the dot and anchor at the end so only a trailing '.csv' is
        # stripped (the original '.csv' pattern matched any char + 'csv'
        # anywhere in the name).
        sample_names = [re.sub(r'\.csv$', '', os.path.split(i)[-1])
                        for i in csv_files]
    # Load counts
    print('Loading count matrices...')
    counts_dict = OrderedDict()
    for csv_file, sample in zip(csv_files, sample_names):
        print(sample)
        counts = pd.read_csv(csv_file, index_col=0)
        # Prefix cell names with the sample so cells stay unique after merging
        counts.index = sample + '_' + counts.index.astype(str)
        # Remove genes with zero total counts in this sample
        counts = counts.loc[:, counts.sum() > 0]
        # NOTE(review): int16 caps individual counts at 32767 — assumed safe
        # for UMI data; confirm for deeply sequenced samples
        counts = counts.astype(np.int16)
        # Update dictionary
        counts_dict[sample] = counts
    # Concatenate cells and genes
    print('Concatenating data..')
    all_cells = list(chain(*[list(counts_dict[sample].index)
                             for sample in sample_names]))
    all_genes = list(
        chain(*[list(counts_dict[sample].columns) for sample in sample_names]))
    all_genes = list(set(all_genes))
    # Counts matrix: genes absent from a sample stay zero for its cells
    counts = pd.DataFrame(0, index=all_cells,
                          columns=all_genes, dtype=np.int16)
    for sample in sample_names:
        sample_counts = counts_dict[sample]
        counts.loc[sample_counts.index, sample_counts.columns] = sample_counts
    # Filter out low-detection genes. Bug fix: count the number of cells in
    # which each gene is detected (count > 0), as documented, instead of
    # comparing the summed counts against min_cell_count.
    detected_cells = (counts > 0).sum()
    counts = counts.loc[:, counts.columns[detected_cells > min_cell_count]]
    return counts
def hvg_genes(norm_df, no_genes=1000):
    """Select highly variable genes based on normalized dispersion.

    :param norm_df: Normalized expression matrix, cells x genes
    :param no_genes: Number of genes to select
    :return: Index of the no_genes genes with the highest normalized
        dispersion
    """

    def _mad(values):
        # Median absolute deviation scaled by the normal consistency
        # constant Phi^{-1}(3/4) — numerically identical to
        # statsmodels.robust.mad, dropping the statsmodels dependency.
        # Empty bins yield NaN without raising the "Mean of empty slice"
        # warning the original had to suppress.
        arr = np.asarray(values, dtype=float)
        if arr.size == 0:
            return np.nan
        return np.median(np.abs(arr - np.median(arr))) / 0.6744897501960817

    # Per-gene mean and dispersion (variance / mean); genes with zero mean
    # produce NaN dispersion, which is reset to 0
    mean = norm_df.mean()
    var = norm_df.var()
    df = pd.DataFrame({'mean': mean, 'dispersion': var / mean})
    df = df.fillna(0)
    # Bin genes by mean expression; duplicates='drop' guards against
    # repeated percentile edges when many genes share the same mean
    bins = np.r_[-np.inf,
                 np.percentile(df['mean'], np.arange(10, 105, 5)), np.inf]
    df['mean_bin'] = pd.cut(df['mean'], bins, duplicates='drop')
    disp_grouped = df.groupby('mean_bin')['dispersion']
    disp_median_bin = disp_grouped.median()
    disp_mad_bin = disp_grouped.apply(_mad)
    # Robust z-score of each gene's dispersion within its mean bin
    df['dispersion_norm'] = np.abs(df['dispersion'].values
                                   - disp_median_bin[df['mean_bin'].values].values) \
        / disp_mad_bin[df['mean_bin'].values].values
    # Bug fix: sort descending with NaNs last. The original ascending sort
    # followed by [::-1] placed NaN normalized dispersions FIRST, so genes
    # with undefined dispersion were selected ahead of truly variable ones.
    use_genes = df['dispersion_norm'].sort_values(
        ascending=False, na_position='last').index[:no_genes]
    return use_genes
def run_pca(data, device, n_components=300, var_explained=0.85):
    """Run PCA

    :param data: Dataframe of cells X genes. Typically multiscale space
        diffusion components
    :param device: 'gpu' (cuml) or 'cpu' (scikit-learn)
    :param n_components: Maximum number of principal components
    :param var_explained: Include components that explain this amount of
        variance. Note number of components =
        min(n_components, components explaining var_explained)
    :return: PCA projections of the data and the explained variance ratios
    :raises ValueError: if device is neither 'cpu' nor 'gpu'
    """
    # Cannot extract more components than there are observations
    init_components = min(n_components, data.shape[0])
    if device == "gpu":
        from cuml import PCA
        pca = PCA(n_components=init_components)
    elif device == "cpu":
        from sklearn.decomposition import PCA
        pca = PCA(n_components=init_components, svd_solver='randomized')
    else:
        # Bug fix: previously an unknown device left `pca` unbound and
        # crashed with a confusing NameError at pca.fit
        raise ValueError(f"Unknown device '{device}'; expected 'cpu' or 'gpu'")
    pca.fit(data)
    n_components = init_components
    # Bug fix: compare against var_explained (was hard-coded 0.85)
    if pca.explained_variance_ratio_.sum() >= var_explained:
        # Bug fix: np.where returns the INDEX of the first component at
        # which the cumulative ratio crosses the threshold, so the number
        # of components is index + 1 (the original was off by one)
        n_components = int(np.where(
            np.cumsum(pca.explained_variance_ratio_) >= var_explained)[0][0]) + 1
    print(f'Running PCA with {n_components} components')
    # Bug fix: reuse the fitted model and keep only n_components columns.
    # The original called fit_transform on the same init_components model,
    # so the var_explained truncation was computed but never applied.
    pca_projections = pca.transform(data)[:, :n_components]
    pca_projections = pd.DataFrame(pca_projections, index=data.index)
    return pca_projections, pca.explained_variance_ratio_
def normalize_counts(data, multiplier=10000):
    """Correct the counts for molecule count variability.

    Divides every cell (row) by its total molecule count and rescales by
    `multiplier`, so each cell's counts sum to `multiplier`.

    :param data: Counts matrix: Cells x Genes
    :param multiplier: Target per-cell total after rescaling
    :return: Normalized matrix
    """
    library_size = data.sum(axis=1)
    scaled = data.div(library_size, axis=0)
    return scaled * multiplier
def log_transform(data, pseudo_count=0.1):
    """Log transform the matrix.

    :param data: Counts matrix: Cells x Genes
    :param pseudo_count: Offset added before taking logs so zero counts
        remain finite
    :return: Log transformed matrix
    """
    shifted = data + pseudo_count
    return np.log2(shifted)