-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
358 additions
and
336 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
## ----setup, include = FALSE---------------------------------------------- | ||
knitr::opts_chunk$set( | ||
collapse = TRUE, | ||
eval=FALSE, | ||
comment = "#>" | ||
) | ||
|
||
## ------------------------------------------------------------------------ | ||
# | ||
# library(dropClust) | ||
# # ------------------------------------- | ||
# # specify paths and load functions | ||
# # ------------------------------------- | ||
# WORK_DIR = "E:/Projects/dropClust/" | ||
# DATA_DIR <- file.path(WORK_DIR,"data") | ||
# FIG_DIR <- file.path(WORK_DIR,"plots") | ||
# REPORT_DIR <- file.path(WORK_DIR,"report") | ||
# | ||
# dir.create(file.path(FIG_DIR),showWarnings = FALSE) | ||
# dir.create(file.path(REPORT_DIR),showWarnings = FALSE) | ||
# | ||
# set.seed(0) | ||
|
||
## ------------------------------------------------------------------------ | ||
# # Load Data | ||
# # path contains contains decompressed files | ||
# pbmc.data<-read_data(file.path(DATA_DIR,'pbmc3k/hg19'), format = "10X") | ||
# | ||
|
||
## ------------------------------------------------------------------------ | ||
# # Filter poor quality cells. A threshold th corresponds to the total count of a cell. | ||
# filtered.data <- filter_cells(pbmc.data) | ||
# | ||
# dim(filtered.data$mat) | ||
# | ||
|
||
## ------------------------------------------------------------------------ | ||
# print("Fetch rare genes...") | ||
# rare_data <- get_rare_genes(filtered.data) | ||
# | ||
# | ||
# | ||
|
||
## ------------------------------------------------------------------------ | ||
# # Filter poor genes | ||
# # Genes with UMI count greater than min.count = 2 in atleast min.cell = 3 cells is retained. | ||
# lnorm<-normalize(filtered.data, min.count=2, min.cell=3) | ||
# | ||
# | ||
# # Select Top Dispersed Genes by setting ngenes_keep. | ||
# dp_genes <- dispersion_genes(lnorm, ngenes_keep = 1000) | ||
# | ||
# | ||
# # Log Normalize Matrix with genes-subset, | ||
# # perform batch effect removal operation when input contains batches | ||
# whole <- matrix.transform(lnorm,dp_genes ,rare_data, batch_correction = FALSE) | ||
# | ||
# | ||
|
||
## ------------------------------------------------------------------------ | ||
# | ||
# sample_ids = initial.samples(filtered.data, rare_data) | ||
# | ||
# | ||
# # Structure preserving Sampling | ||
# | ||
# samples_louvain<-sampling(whole[sample_ids,]) | ||
# subsamples_louvain<-sample_ids[samples_louvain] | ||
# | ||
# | ||
|
||
## ------------------------------------------------------------------------ | ||
# | ||
# | ||
# # Find PCA top 200 genes. This may take some time. | ||
# top_pc_genes<-pc_genes(whole[subsamples_louvain,],top=200) | ||
# | ||
|
||
## ------------------------------------------------------------------------ | ||
# | ||
# # Adjust Minimum cluster size with argument minClusterSize (default = 20) | ||
# # Adjust tree cut with argument level deepSplit (default = 3), higher value produces more clusters. | ||
# | ||
# clust.list<-cluster.cells(data = whole[,top_pc_genes], sp.samples = subsamples_louvain, | ||
# default = FALSE, minClusterSize = 30,deepSplit = 2, conf = 0.8) | ||
# | ||
|
||
## ------------------------------------------------------------------------ | ||
# # ---------------------------- | ||
# # Tsne & CO-ORDINATE Projection | ||
# # ---------------------------- | ||
# PROJ <- compute_2d_embedding(data = whole[,top_pc_genes], | ||
# sp.samples = subsamples_louvain, | ||
# clust.list = clust.list) | ||
# | ||
# | ||
# # Scatter Plot | ||
# | ||
# plot_proj_df_pred<-data.frame(Y1 = PROJ[,1],Y2 = PROJ[,2],color = as.factor(clust.list$cluster.ident)) | ||
# | ||
# p<-all_plot(plot_proj_df_pred,filename = NA, title = "dropClust clusters") | ||
# | ||
# print(p) | ||
# | ||
# | ||
|
||
## ------------------------------------------------------------------------ | ||
# | ||
# # Construct sub matrix for DE analysis | ||
# de.mat<- reduce_mat_de(lnorm,clust.list) | ||
# | ||
# | ||
# | ||
# # Pick Cell Type Specific Genes | ||
# ############################# | ||
# | ||
# # Cells of interest | ||
# GRP = levels(clust.list$cluster.ident) | ||
# # int_cells = which(label %in% GRP) | ||
# | ||
# DE_genes_nodes_all <- DE_genes(de_data = de.mat, selected_clusters = GRP, lfc_th = 1,q_th = 0.001) | ||
# | ||
# DE_genes_nodes_all$genes | ||
# | ||
# write.csv(DE_genes_nodes_all$genes, | ||
# file = file.path(REPORT_DIR, "ct_genes.csv"), | ||
# quote = FALSE) | ||
|
||
## ------------------------------------------------------------------------ | ||
# | ||
# marker_genes = c("S100A8","GNLY","PF4" ) | ||
# | ||
# p<-plot_markers(de_data = de.mat, marker_genes) | ||
# | ||
# | ||
|
||
## ------------------------------------------------------------------------ | ||
# # Draw heatmap | ||
# ############################# | ||
# p<-plot_heatmap(de_data = de.mat, DE_res = DE_genes_nodes_all$DE_res,nDE = 10) | ||
# | ||
# print(p) | ||
# | ||
# | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,212 @@ | ||
--- | ||
title: "dropClust for Single Cell Transcriptome analysis" | ||
Version: "2.0.0" | ||
output: | ||
pdf_document | ||
vignette: > | ||
%\VignetteEngine{knitr::rmarkdown} | ||
%\VignetteIndexEntry{dropClust for Single Cell Transcriptome analysis} | ||
%\usepackage[utf8]{inputenc} | ||
--- | ||
|
||
```{r setup, include = FALSE} | ||
knitr::opts_chunk$set( | ||
collapse = TRUE, | ||
eval=FALSE, | ||
comment = "#>" | ||
) | ||
``` | ||
|
||
This latest version of dropClust has been re-implemented to include new features and performance improvements. This vignette uses a small data set from the 10X website (3K PBMC dataset [here](http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz) ) to demonstrate a standard pipeline. This vignette can be used as a tutorial as well. | ||
|
||
|
||
## Setting up directories | ||
|
||
```{r} | ||
library(dropClust) | ||
# ------------------------------------- | ||
# specify paths and load functions | ||
# ------------------------------------- | ||
WORK_DIR = "E:/Projects/dropClust/" | ||
DATA_DIR <- file.path(WORK_DIR,"data") | ||
FIG_DIR <- file.path(WORK_DIR,"plots") | ||
REPORT_DIR <- file.path(WORK_DIR,"report") | ||
dir.create(file.path(FIG_DIR),showWarnings = FALSE) | ||
dir.create(file.path(REPORT_DIR),showWarnings = FALSE) | ||
set.seed(0) | ||
``` | ||
## Loading data | ||
dropClust loads UMI count expression data from three input files. The files follow the same structure as the datasets available from the 10X website, i.e.: | ||
|
||
- count matrix file in sparse format | ||
- transcriptome identifiers as a TSV file and | ||
- gene identifiers as a TSV file | ||
|
||
```{r} | ||
# Load Data | ||
# path contains contains decompressed files | ||
pbmc.data<-read_data(file.path(DATA_DIR,'pbmc3k/hg19'), format = "10X") | ||
``` | ||
|
||
## Pre-processing | ||
dropClust performs pre-processing to remove poor quality cells and genes. dropClust is also equipped to mitigate batch-effects that may be present. The user does not need to provide any information regarding the source of the batch for individual transcriptomes. However, the batch-effect removal step is optional. | ||
|
||
Cells are filtered based on the total UMI count in a cell specified by parameter `th`. | ||
|
||
```{r} | ||
# Filter poor quality cells. A threshold th corresponds to the total count of a cell. | ||
filtered.data <- filter_cells(pbmc.data) | ||
dim(filtered.data$mat) | ||
``` | ||
### Boosting rare cell discovery | ||
dropClust estimates minor population and their associated genes directly from the raw data by finding exclusive groups of co-expressed genes within a small population of transcriptomes. These genes and cells are then included for later processing to boost rare-cell discovery. | ||
|
||
```{r} | ||
print("Fetch rare genes...") | ||
rare_data <- get_rare_genes(filtered.data) | ||
``` | ||
### Data normalization and removing poor quality genes | ||
Poor quality genes are removed based on the minimum number of cells with expressions above a given threshold. Count normalization is then performed with the good quality genes only. | ||
|
||
Further gene selection is carried out by ranking the genes based on its dispersion index. | ||
|
||
```{r} | ||
# Filter poor genes | ||
# Genes with UMI count greater than min.count = 2 in atleast min.cell = 3 cells is retained. | ||
lnorm<-normalize(filtered.data, min.count=2, min.cell=3) | ||
# Select Top Dispersed Genes by setting ngenes_keep. | ||
dp_genes <- dispersion_genes(lnorm, ngenes_keep = 1000) | ||
# Log Normalize Matrix with genes-subset, | ||
# perform batch effect removal operation when input contains batches | ||
whole <- matrix.transform(lnorm,dp_genes ,rare_data, batch_correction = FALSE) | ||
``` | ||
|
||
## Structure Preserving Sampling | ||
Primary clustering is performed in a fast manner to estimate a gross structure of the data. Each of these clusters is then sampled to fine tune the clustering process. | ||
|
||
```{r} | ||
sample_ids = initial.samples(filtered.data, rare_data) | ||
# Structure preserving Sampling | ||
samples_louvain<-sampling(whole[sample_ids,]) | ||
subsamples_louvain<-sample_ids[samples_louvain] | ||
``` | ||
|
||
### Gene selection based on PCA | ||
Another gene selection is performed to reduce the number of dimensions. PCA is used to identify genes affecting major components. | ||
|
||
```{r} | ||
# Find PCA top 200 genes. This may take some time. | ||
top_pc_genes<-pc_genes(whole[subsamples_louvain,],top=200) | ||
``` | ||
|
||
## Perform clustering | ||
|
||
### Fine tuning the clustering process | ||
By default best-fit, Louvain based clusters are returned. However, the user can tune the parameters to produce the desired number of clusters. The un-sampled transcriptomes are assigned cluster identifiers from among those identifiers produced from fine-tuning clustering. The post-hoc assignment can be controlled by setting the confidence value `conf`. High `conf` values will assign cluster identifiers to only those transcriptomes sharing a majority of common nearest neighbours. | ||
|
||
|
||
```{r} | ||
# Adjust Minimum cluster size with argument minClusterSize (default = 20) | ||
# Adjust tree cut with argument level deepSplit (default = 3), higher value produces more clusters. | ||
clust.list<-cluster.cells(data = whole[,top_pc_genes], sp.samples = subsamples_louvain, | ||
default = FALSE, minClusterSize = 30,deepSplit = 2, conf = 0.8) | ||
``` | ||
|
||
|
||
## Visualizing clusters | ||
|
||
Compute 2D embeddings for samples followed by post-hoc clustering. | ||
|
||
```{r} | ||
# ---------------------------- | ||
# Tsne & CO-ORDINATE Projection | ||
# ---------------------------- | ||
PROJ <- compute_2d_embedding(data = whole[,top_pc_genes], | ||
sp.samples = subsamples_louvain, | ||
clust.list = clust.list) | ||
# Scatter Plot | ||
plot_proj_df_pred<-data.frame(Y1 = PROJ[,1],Y2 = PROJ[,2],color = as.factor(clust.list$cluster.ident)) | ||
p<-all_plot(plot_proj_df_pred,filename = NA, title = "dropClust clusters") | ||
print(p) | ||
``` | ||
|
||
## Find cluster specific Differentially Expressed genes | ||
|
||
```{r} | ||
# Construct sub matrix for DE analysis | ||
de.mat<- reduce_mat_de(lnorm,clust.list) | ||
# Pick Cell Type Specific Genes | ||
############################# | ||
# Cells of interest | ||
GRP = levels(clust.list$cluster.ident) | ||
# int_cells = which(label %in% GRP) | ||
DE_genes_nodes_all <- DE_genes(de_data = de.mat, selected_clusters = GRP, lfc_th = 1,q_th = 0.001) | ||
DE_genes_nodes_all$genes | ||
write.csv(DE_genes_nodes_all$genes, | ||
file = file.path(REPORT_DIR, "ct_genes.csv"), | ||
quote = FALSE) | ||
``` | ||
|
||
## Plot hand picked marker genes | ||
|
||
```{r} | ||
marker_genes = c("S100A8","GNLY","PF4" ) | ||
p<-plot_markers(de_data = de.mat, marker_genes) | ||
``` | ||
|
||
## Heat map of top DE genes from each cluster | ||
```{r} | ||
# Draw heatmap | ||
############################# | ||
p<-plot_heatmap(de_data = de.mat, DE_res = DE_genes_nodes_all$DE_res,nDE = 10) | ||
print(p) | ||
``` |
Oops, something went wrong.