Skip to content

Commit

Permalink
Merge branch 'devel' of https://github.com/campbio/celda into devel
Browse files Browse the repository at this point in the history
  • Loading branch information
joshua-d-campbell committed Mar 28, 2022
2 parents 7e5d820 + df4f332 commit 9d69d59
Show file tree
Hide file tree
Showing 5 changed files with 149 additions and 29 deletions.
8 changes: 4 additions & 4 deletions R/celda_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,9 @@ recodeClusterZ <- function(sce, from, to, altExpName = "featureSubset") {
new.clusters <- plyr::mapvalues(celdaClusters(sce,
altExpName = altExpName),
from, to)
new.clusters <- factor(new.clusters, levels =
new.clusters <- factor(new.clusters, levels =
sort(as.numeric(unique(new.clusters))))

celdaClusters(sce, altExpName = altExpName) <- new.clusters
return(sce)
}
Expand Down Expand Up @@ -218,9 +218,9 @@ recodeClusterY <- function(sce, from, to, altExpName = "featureSubset") {
new.clusters <- plyr::mapvalues(celdaModules(sce,
altExpName = altExpName),
from, to)
new.clusters <- factor(new.clusters, levels =
new.clusters <- factor(new.clusters, levels =
sort(as.numeric(unique(new.clusters))))

celdaModules(sce, altExpName = altExpName) <- plyr::mapvalues(
celdaModules(sce, altExpName = altExpName), from, to)
return(sce)
Expand Down
107 changes: 88 additions & 19 deletions R/decon.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,17 @@
#' should be considered different batches. Default NULL.
#' @param background A numeric matrix of counts or a
#' \linkS4class{SingleCellExperiment} with the matrix located in the assay
#' slot under \code{assayName}. It should have the same structure as \code{x}
#' except it contains the matrix of empty droplets instead of cells. When
#' supplied, empirical distribution of transcripts from these empty droplets
#' slot under \code{assayName}. It should have the same data format as \code{x}
#' except it contains the empty droplets instead of cells. When supplied,
#' the empirical distribution of transcripts from these empty droplets
#' will be used as the contamination distribution. Default NULL.
#' @param bgAssayName Character. Name of the assay to use if \code{background}
#' is a \linkS4class{SingleCellExperiment}. Default to same as
#' \code{assayName}.
#' @param bgBatch Numeric or character vector. Batch labels for the columns of
#' \code{background}. Its unique values should be the same as those in
#' \code{batch}, so that each batch of cells is matched to its corresponding
#' batch of empty droplets in \code{background}. Default NULL.
#' @param maxIter Integer. Maximum iterations of the EM algorithm. Default 500.
#' @param convergence Numeric. The EM algorithm will be stopped if the maximum
#' difference in the contamination estimates between the previous and
Expand Down Expand Up @@ -152,6 +156,7 @@ setMethod("decontX", "SingleCellExperiment", function(x,
batch = NULL,
background = NULL,
bgAssayName = NULL,
bgBatch = NULL,
maxIter = 500,
delta = c(10, 10),
estimateDelta = TRUE,
Expand All @@ -165,8 +170,16 @@ setMethod("decontX", "SingleCellExperiment", function(x,
countsBackground <- NULL
if (!is.null(background)) {
# Remove cells with the same ID between x and the background matrix
background <- .checkBackground(x = x, background = background,
logfile = logfile, verbose = verbose)
# Also update bgBatch when background is updated and bgBatch is not null
temp <- .checkBackground(x = x,
background = background,
bgBatch = bgBatch,
logfile = logfile,
verbose = verbose)

background <- temp$background
bgBatch <- temp$bgBatch

if (is.null(bgAssayName)) {
bgAssayName <- assayName
}
Expand All @@ -180,6 +193,7 @@ setMethod("decontX", "SingleCellExperiment", function(x,
z = z,
batch = batch,
countsBackground = countsBackground,
batchBackground = bgBatch,
maxIter = maxIter,
convergence = convergence,
iterLogLik = iterLogLik,
Expand Down Expand Up @@ -232,6 +246,7 @@ setMethod("decontX", "ANY", function(x,
z = NULL,
batch = NULL,
background = NULL,
bgBatch = NULL,
maxIter = 500,
delta = c(10, 10),
estimateDelta = TRUE,
Expand All @@ -246,15 +261,26 @@ setMethod("decontX", "ANY", function(x,
countsBackground <- NULL
if (!is.null(background)) {
# Remove cells with the same ID between x and the background matrix
background <- .checkBackground(x = x, background = background,
logfile = logfile, verbose = verbose)
# Also update bgBatch when background is updated and bgBatch is not null
temp <- .checkBackground(x = x,
background = background,
bgBatch = bgBatch,
logfile = logfile,
verbose = verbose)

background <- temp$background
countsBackground <- background

bgBatch <- temp$bgBatch

}

.decontX(
counts = x,
z = z,
batch = batch,
countsBackground = countsBackground,
batchBackground = bgBatch,
maxIter = maxIter,
convergence = convergence,
iterLogLik = iterLogLik,
Expand Down Expand Up @@ -337,6 +363,7 @@ setMethod(
z = NULL,
batch = NULL,
countsBackground = NULL,
batchBackground = NULL,
maxIter = 200,
convergence = 0.001,
iterLogLik = 10,
Expand Down Expand Up @@ -367,6 +394,7 @@ setMethod(
runParams <- list(
z = z,
batch = batch,
batchBackground = batchBackground,
maxIter = maxIter,
delta = delta,
estimateDelta = estimateDelta,
Expand All @@ -384,7 +412,7 @@ setMethod(
nC <- ncol(counts)
allCellNames <- colnames(counts)

## Set up final deconaminated matrix
## Set up final decontaminated matrix
estRmat <- Matrix::Matrix(
data = 0,
ncol = totalCells,
Expand All @@ -396,8 +424,32 @@ setMethod(
## Generate batch labels if none were supplied
if (is.null(batch)) {
batch <- rep("all_cells", nC)

# If batch null, bgBatch has to be null
if (!is.null(batchBackground)) {
stop(
"When experiment default to no bacth, background should ",
"also default to no batch."
)
}

if (!is.null(countsBackground)) {
batchBackground <- rep("all_cells", ncol(countsBackground))
}
} else {

# If batch not null and countsBackground supplied,
# user has to supply batchBackground as well
if (!is.null(countsBackground) & is.null(batchBackground)) {
stop(
"Cell batch, and background are supplied. Please also ",
"supply background batch."
)
}

}
runParams$batch <- batch
runParams$batchBackground <- batchBackground
batchIndex <- unique(batch)

## Set result lists upfront for all cells from different batches
Expand Down Expand Up @@ -430,6 +482,7 @@ setMethod(

zBat <- NULL
countsBat <- counts[, batch == bat]
bgBat <- countsBackground[, batchBackground == bat]

## Convert to sparse matrix
if (!inherits(countsBat, "dgCMatrix")) {
Expand All @@ -442,9 +495,9 @@ setMethod(
)
countsBat <- methods::as(countsBat, "dgCMatrix")
}
if (!is.null(countsBackground)) {
if (!inherits(countsBackground, "dgCMatrix")) {
countsBackground <- methods::as(countsBackground, "dgCMatrix")
if (!is.null(bgBat)) {
if (!inherits(bgBat, "dgCMatrix")) {
bgBat <- methods::as(bgBat, "dgCMatrix")
}
}

Expand All @@ -456,7 +509,7 @@ setMethod(
counts = countsBat,
z = zBat,
batch = bat,
countsBackground = countsBackground,
countsBackground = bgBat,
maxIter = maxIter,
delta = delta,
estimateDelta = estimateDelta,
Expand All @@ -475,7 +528,7 @@ setMethod(
counts = countsBat,
z = zBat,
batch = bat,
countsBackground = countsBackground,
countsBackground = bgBat,
maxIter = maxIter,
delta = delta,
estimateDelta = estimateDelta,
Expand All @@ -491,7 +544,7 @@ setMethod(
}

## Try to convert class of new matrix to class of original matrix

.logMessages(
date(),
".. Calculating final decontaminated matrix",
Expand Down Expand Up @@ -563,7 +616,7 @@ setMethod(
append = TRUE,
verbose = verbose
)

## Determine class of seed in DelayedArray
seed.class <- unique(DelayedArray::seedApply(counts, class))[[1]]
if (seed.class == "HDF5ArraySeed") {
Expand Down Expand Up @@ -1369,9 +1422,11 @@ simulateContamination <- function(C = 300,
}


.checkBackground <- function(x, background, logfile = NULL, verbose = FALSE) {
.checkBackground <- function(x, background, bgBatch,
logfile = NULL, verbose = FALSE) {
# Remove background barcodes that have already appeared in x
if(!is.null(colnames(background))) {
# If bgBatch param is supplied, also remove duplicate bgBatch
if (!is.null(colnames(background))) {
dupBarcode <- colnames(background) %in% colnames(x)
} else {
dupBarcode <- FALSE
Expand All @@ -1381,7 +1436,7 @@ simulateContamination <- function(C = 300,
" Please ensure that no true cells are included in the background ",
"matrix. Otherwise, results will be incorrect.")
}

if (any(dupBarcode)) {
.logMessages(
date(),
Expand All @@ -1394,6 +1449,20 @@ simulateContamination <- function(C = 300,
verbose = verbose
)
background <- background[, !(dupBarcode), drop = FALSE]

if (!is.null(bgBatch)) {
if (length(bgBatch) != length(dupBarcode)) {
stop(
"Length of bgBatch must be equal to the number of columns",
"of background matrix."
)
}
bgBatch <- bgBatch[!(dupBarcode)]
}
}
return(background)

re <- list(background = background,
bgBatch = bgBatch)

return(re)
}
13 changes: 10 additions & 3 deletions man/decontX.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@

using namespace Rcpp;

#ifdef RCPP_USE_GLOBAL_ROSTREAM
Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
#endif

// decontXEM
Rcpp::List decontXEM(const Eigen::MappedSparseMatrix<double>& counts, const NumericVector& counts_colsums, const NumericVector& theta, const bool& estimate_eta, const NumericMatrix& eta, const NumericMatrix& phi, const IntegerVector& z, const bool& estimate_delta, const NumericVector& delta, const double& pseudocount);
RcppExport SEXP _celda_decontXEM(SEXP countsSEXP, SEXP counts_colsumsSEXP, SEXP thetaSEXP, SEXP estimate_etaSEXP, SEXP etaSEXP, SEXP phiSEXP, SEXP zSEXP, SEXP estimate_deltaSEXP, SEXP deltaSEXP, SEXP pseudocountSEXP) {
Expand Down
45 changes: 42 additions & 3 deletions vignettes/decontX.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -59,19 +59,21 @@ rownames(sce) <- rowData(sce)$Symbol_TENx

# Running decontX

A SingleCellExperiment (SCE) object or a sparse matrix containing the counts for filtered cells can be passed to decontX via the `x` parameter. There are two major ways to run decontX: with and without the raw/droplet matrix containing empty droplets. The raw/droplet matrix can be used to empirically estimate the distribution of ambient RNA, which is especially useful when cells that contributed to the ambient RNA are not accurately represented in the filtered count matrix containing the cells. For example, cells that were removed via flow cytometry or that were more sensitive to lysis during dissociation may have contributed to the ambient RNA but were not measured in the filtered/cell matrix. The raw/droplet matrix can be input as a sparse matrix or SCE object using the `background` parameter:
A SingleCellExperiment (SCE) object or a sparse matrix containing the counts for filtered cells can be passed to decontX via the `x` parameter. There are two major ways to run decontX: with and without the raw/droplet matrix containing empty droplets. The raw/droplet matrix can be used to empirically estimate the distribution of ambient RNA, which is especially useful when cells that contributed to the ambient RNA are not accurately represented in the filtered count matrix containing the cells. For example, cells that were removed via flow cytometry or that were more sensitive to lysis during dissociation may have contributed to the ambient RNA but were not measured in the filtered/cell matrix. The raw/droplet matrix can be input as an SCE object or a sparse matrix using the `background` parameter:

```{r decontX_background, eval=FALSE, message=FALSE}
sce <- decontX(sce, background = raw)
```

If cell/column names in the raw/droplet matrix are also found in the filtered counts matrix, then they will be excluded from the raw/droplet matrix before calculation of the ambient RNA distribution. If the raw matrix is not available, then `decontX` will estimate the contamination distribution for each cell cluster based on the profiles of the other cell clusters in the filtered dataset:
We would like to stress that the `background` input was designed to contain only empty droplets. If the `background` input contains both cells and empty droplets (for example, the raw output from 10X Genomics), the software will look for cell/column names in the raw matrix (`background`) that are also found in the filtered counts matrix (`x`) and exclude them from the raw matrix. When cell/column names are not available for the input objects, the software will treat the entire `background` input as empty droplets. If true cells are present in that case, the estimate of the ambient RNA profile will be incorrect.

If the raw matrix is not available, then `decontX` will estimate the contamination distribution for each cell cluster based on the profiles of the other cell clusters in the filtered dataset:

```{r decontX, eval=TRUE, message=FALSE}
sce <- decontX(sce)
```

Note that in this case `decontX` will perform heuristic clustering to quickly define major cell clusters. However if you have your own cell cluster labels, they can be specified with the `z` parameter. If you supply a raw matrix via the `background` parameter, then the `z` parameter will not have an effect as clustering will not be performed.
Note that in this case `decontX` will perform heuristic clustering to quickly define major cell clusters. However, if you have your own cell cluster labels, they can be specified with the `z` parameter.

The contamination can be found in the `colData(sce)$decontX_contamination` and the decontaminated counts can be accessed with `decontXcounts(sce)`. If the input object was a matrix, make sure to save the output into a variable with a different name (e.g. result). The result object will be a list with contamination in `result$contamination` and the decontaminated counts in `result$decontXcounts`.

Expand Down Expand Up @@ -202,6 +204,43 @@ plot(sce$decontX_contamination, sce.delta$decontX_contamination,
abline(0, 1, col = "red", lwd = 2)
```

## Integration with packages such as Seurat and singleCellTK
You can integrate decontX into your scRNA-seq analysis pipelines, such as the one provided by [Seurat](https://cran.r-project.org/web/packages/Seurat/index.html). Both decontX and Seurat take a count matrix as input, although the decontaminated matrix produced by decontX consists of floating point numbers. As a heuristic, you can round the decontaminated matrix to integers before passing it to your Seurat pipeline.

```{r seuratIntegration, eval=FALSE}
library(Seurat)
counts <- Read10X("path/to/file")
# Convert count matrix to SingleCellExperiment to run on decontX
sce <- SingleCellExperiment(list(counts = counts))
sce <- decontX(sce)
# Retrieve decontaminated matrix, round to integer, and convert to Seurat object
decontaminated.matrix <- decontXcounts(sce)
decontaminated.counts <- round(decontaminated.matrix)
seuratObject <- CreateSeuratObject(decontaminated.counts)
```

Conversely, if you have a Seurat object containing a raw count matrix and would like to run decontX, simply retrieve the count matrix, convert it to a SingleCellExperiment object, and run decontX on it.

```{r seuratIntegration2, eval=FALSE}
counts <- GetAssayData(object = seuratObject, slot = "counts")
sce <- SingleCellExperiment(list(counts = counts))
sce <- decontX(sce)
```


To import datasets into a SingleCellExperiment object, the [singleCellTK](https://bioconductor.org/packages/release/bioc/html/singleCellTK.html) package has several importing functions for different preprocessing tools including CellRanger, STARsolo, BUStools, Optimus, DropEST, SEQC, and Alevin/Salmon. For example, the following code can be used as a template to read in the filtered and raw matrices for multiple samples processed with CellRanger:

```{r singleCellTKIntegration, eval=FALSE}
library(singleCellTK)
sce <- importCellRanger(sampleDirs = c("path/to/sample1/", "path/to/sample2/"))
sce.raw <- importCellRanger(sampleDirs = c("path/to/sample1/", "path/to/sample2/"), dataType = "raw")
```


# Session Information

```{r}
Expand Down

0 comments on commit 9d69d59

Please sign in to comment.