Skip to content

Commit

Permalink
v0.0.9
Browse files Browse the repository at this point in the history
faster calculation of cell similarity; change variation threshold.
  • Loading branch information
Vivianstats committed Aug 15, 2018
1 parent cc491c3 commit b1623c5
Show file tree
Hide file tree
Showing 4 changed files with 191 additions and 77 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: scImpute
Type: Package
Title: Accurate and robust imputation of single-cell RNA sequencing data
Version: 0.0.8
Date: 2018-06-27
Version: 0.0.9
Date: 2018-08-15
Author: Wei Vivian Li, Jingyi Jessica Li
Maintainer: Wei Vivian Li <[email protected]>
Description: scRNA-seq analysis is complicated by the excess of zero or near zero counts in the data, which are the so-called dropouts due to low amounts of mRNA within each individual cell. scImpute is developed to simultaneously determine which expression values are affected by dropout events in scRNA-seq data and perform imputation only on dropout entries.
Expand Down
101 changes: 67 additions & 34 deletions R/imputation_model.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,26 +32,38 @@ find_neighbors = function(count_hv, labeled, J, Kcluster = NULL,
cell_inds = which(clust == ll)
count_hv_sub = count_hv[, cell_inds, drop = FALSE]
if(J < 1000){
var_thre = 0.4
pca = prcomp(t(count_hv_sub))
eigs = (pca$sdev)^2
var_cum = cumsum(eigs)/sum(eigs)
if(max(var_cum) <= var_thre){
npc = length(var_cum)
}else{
npc = which.max(var_cum > var_thre)
if (labeled == FALSE){ npc = max(npc, Kcluster) }
}
}else{
pca = rpca(t(count_hv_sub), k = 100, center = TRUE, scale = FALSE)
}
eigs = (pca$sdev)^2
var_cum = cumsum(eigs)/sum(eigs)
if(max(var_cum) <= 0.8){
npc = length(var_cum)
}else{
npc = which.max(var_cum > 0.8)
var_thre = 0.6
pca = rpca(t(count_hv_sub), k = 1000, center = TRUE, scale = FALSE)
eigs = (pca$sdev)^2
var_cum = cumsum(eigs)/sum(eigs)
if(max(var_cum) <= var_thre){
npc = length(var_cum)
}else{
npc = which.max(var_cum > var_thre)
if (labeled == FALSE){ npc = max(npc, Kcluster) }
}
}

if (npc < 3){ npc = 3 }
mat_pcs = t(pca$x[, 1:npc])

dist_cells_list = mclapply(1:length(cell_inds), function(id1){
sapply(1:length(cell_inds), function(id2){
if(id1 <= id2) return(0)
d = sapply(1:id1, function(id2){
sse = sum((mat_pcs[, id1] - mat_pcs[, id2])^2)
sqrt(sse)
})
return(c(d, rep(0, length(cell_inds)-id1)))
}, mc.cores = ncores)
dist_cells = matrix(0, nrow = length(cell_inds), ncol = length(cell_inds))
for(cellid in 1:length(cell_inds)){dist_cells[cellid, ] = dist_cells_list[[cellid]]}
Expand All @@ -65,29 +77,39 @@ find_neighbors = function(count_hv, labeled, J, Kcluster = NULL,
## dimension reduction
print("dimension reduction ...")
if(J < 1000){
var_thre = 0.4
pca = prcomp(t(count_hv))
eigs = (pca$sdev)^2
var_cum = cumsum(eigs)/sum(eigs)
if(max(var_cum) <= var_thre){
npc = length(var_cum)
}else{
npc = which.max(var_cum > var_thre)
if (labeled == FALSE){ npc = max(npc, Kcluster) }
}
}else{
pca = rpca(t(count_hv), k = 100, center = TRUE, scale = FALSE)
}
eigs = (pca$sdev)^2
var_cum = cumsum(eigs)/sum(eigs)
if(max(var_cum) <= 0.8){
npc = length(var_cum)
}else{
npc = which.max(var_cum > 0.8)
if (labeled == FALSE){ npc = max(npc, Kcluster) }
var_thre = 0.6
pca = rpca(t(count_hv), k = 1000, center = TRUE, scale = FALSE)
eigs = (pca$sdev)^2
var_cum = cumsum(eigs)/sum(eigs)
if(max(var_cum) <= var_thre){
npc = length(var_cum)
}else{
npc = which.max(var_cum > var_thre)
if (labeled == FALSE){ npc = max(npc, Kcluster) }
}
}
if (npc < 3){ npc = 3 }
mat_pcs = t(pca$x[, 1:npc]) # columns are cells

## detect outliers
print("calculating cell distances ...")
dist_cells_list = mclapply(1:J, function(id1){
sapply(1:J, function(id2){
if(id1 <= id2) return(0)
d = sapply(1:id1, function(id2){
sse = sum((mat_pcs[, id1] - mat_pcs[, id2])^2)
sqrt(sse)
})
return(c(d, rep(0, J-id1)))
}, mc.cores = ncores)
dist_cells = matrix(0, nrow = J, ncol = J)
for(cellid in 1:J){dist_cells[cellid, ] = dist_cells_list[[cellid]]}
Expand Down Expand Up @@ -174,27 +196,38 @@ imputation_model8 = function(count, labeled, point, drop_thre = 0.5, Kcluster =
if(Kcluster == 1){
clust = rep(1, J)
if(J < 1000){
var_thre = 0.4
pca = prcomp(t(count_hv))
eigs = (pca$sdev)^2
var_cum = cumsum(eigs)/sum(eigs)
if(max(var_cum) <= var_thre){
npc = length(var_cum)
}else{
npc = which.max(var_cum > var_thre)
if (labeled == FALSE){ npc = max(npc, Kcluster) }
}
}else{
pca = rpca(t(count_hv), k = 100, center = TRUE, scale = FALSE)
}
eigs = (pca$sdev)^2
var_cum = cumsum(eigs)/sum(eigs)
if(max(var_cum) <= 0.8){
npc = length(var_cum)
}else{
npc = which.max(var_cum > 0.8)
if (labeled == FALSE){ npc = max(npc, Kcluster) }
var_thre = 0.6
pca = rpca(t(count_hv), k = 1000, center = TRUE, scale = FALSE)
eigs = (pca$sdev)^2
var_cum = cumsum(eigs)/sum(eigs)
if(max(var_cum) <= var_thre){
npc = length(var_cum)
}else{
npc = which.max(var_cum > var_thre)
if (labeled == FALSE){ npc = max(npc, Kcluster) }
}
}

if (npc < 3){ npc = 3 }
mat_pcs = t(pca$x[, 1:npc]) # columns are cells

dist_cells_list = mclapply(1:J, function(id1){
sapply(1:J, function(id2){
if(id1 <= id2) return(0)
d = sapply(1:id1, function(id2){
sse = sum((mat_pcs[, id1] - mat_pcs[, id2])^2)
sqrt(sse)
})
return(c(d, rep(0, J-id1)))
}, mc.cores = ncores)
dist_cells = matrix(0, nrow = J, ncol = J)
for(cellid in 1:J){dist_cells[cellid, ] = dist_cells_list[[cellid]]}
Expand Down Expand Up @@ -291,7 +324,7 @@ imputation_wlabel_model8 = function(count, labeled, cell_labels = NULL, point, d
print("searching candidate neighbors ... ")
neighbors_res = find_neighbors(count_hv = count_hv, labeled = TRUE, J = J,
ncores = ncores, cell_labels = cell_labels)
dist_cells_list = neighbors_res$dist_list
dist_cells = neighbors_res$dist_cells
clust = neighbors_res$clust

# mixture model
Expand Down Expand Up @@ -348,7 +381,7 @@ imputation_wlabel_model8 = function(count, labeled, cell_labels = NULL, point, d
geneid_drop = setA[[cellid]]
geneid_obs = setB[[cellid]]
y = try(impute_nnls(Ic, cellid = cellid, subcount, droprate, geneid_drop,
geneid_obs, nbs, distc = dist_cells_list[[cc]]),
geneid_obs, nbs, distc = dist_cells[cells, cells]),
silent = TRUE)
if (class(y) == "try-error") {
# print(y)
Expand Down
46 changes: 5 additions & 41 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
scImpute: accurate and robust imputation of scRNA-seq data
================
Wei Vivian Li, Jingyi Jessica Li
2018-06-28
2018-08-15

<!-- README.md is generated from README.Rmd. Please edit that file -->
Latest News
-----------

> 2018/06/27:
> 2018/08/15:
- Version 0.0.8 is released!
- Faster implementation of dimension reduction.
- Version 0.0.9 is released!
- More robust implementation of dimension reduction.
- Faster calculation of cell similarity.

Introduction
------------
Expand Down Expand Up @@ -59,40 +60,3 @@ scimpute(# full path to raw count matrix
This function returns the column indices of outlier cells, and creates a new file `scimpute_count.csv` in `out_dir` to store the imputed count matrix. Please note that we recommend applying scImpute on the whole-genome count matrix. A filtering step on genes is acceptable but most genes should be present to ensure robust identification of dropouts.

For detailed usage, please refer to the package [manual](https://github.com/Vivianstats/scImpute/blob/master/inst/docs/) or [vignette](https://github.com/Vivianstats/scImpute/blob/master/vignettes/scImpute-vignette.Rmd).

Updates
-------

> 2018/06/08:
- Version 0.0.7 is released!
- New option for application on TPM values.

> 2018/03/16:
- Version 0.0.6 is released!
- The scImpute method is published at [*Nature Communications*](https://www.nature.com/articles/s41467-018-03405-7).
- scImpute now supports input and output in the format of R objects (.rds).

> 2018/01/12:
- Version 0.0.5 is released!
- It is now possible to apply scImpute on just one cell population by setting `Kcluster = 1`.

> 2017/10/27:
- Version 0.0.4 is released!
- scImpute now supports multi-core parallelism.

> 2017/10/22:
- Version 0.0.3 is released!
- Estimation of dropout probabilities is more accurate.
- Imputation step is more robust.
- `scimpute()` incorporates a new parameter `Kcluster` to specify the number of cell subpopulations.
- `scImpute` is now able to detect outlier cells.

> 2017/07/01:
- Version 0.0.2 is released!
- This version speeds up the first step in `scImpute` and the program now completes in a few seconds when applied to a dataset with 10,000 genes and 100 cells (using single core).
117 changes: 117 additions & 0 deletions inst/docs/scImpute-news.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
<!DOCTYPE html>

<html xmlns="http://www.w3.org/1999/xhtml">

<head>

<meta charset="utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="pandoc" />

<meta name="viewport" content="width=device-width, initial-scale=1">

<meta name="author" content="Wei Vivian Li" />

<meta name="date" content="2018-08-15" />

<title>scImpute Updates</title>






<link href="data:text/css;charset=utf-8,body%20%7B%0Abackground%2Dcolor%3A%20%23fff%3B%0Amargin%3A%201em%20auto%3B%0Amax%2Dwidth%3A%20700px%3B%0Aoverflow%3A%20visible%3B%0Apadding%2Dleft%3A%202em%3B%0Apadding%2Dright%3A%202em%3B%0Afont%2Dfamily%3A%20%22Open%20Sans%22%2C%20%22Helvetica%20Neue%22%2C%20Helvetica%2C%20Arial%2C%20sans%2Dserif%3B%0Afont%2Dsize%3A%2014px%3B%0Aline%2Dheight%3A%201%2E35%3B%0A%7D%0A%23header%20%7B%0Atext%2Dalign%3A%20center%3B%0A%7D%0A%23TOC%20%7B%0Aclear%3A%20both%3B%0Amargin%3A%200%200%2010px%2010px%3B%0Apadding%3A%204px%3B%0Awidth%3A%20400px%3B%0Aborder%3A%201px%20solid%20%23CCCCCC%3B%0Aborder%2Dradius%3A%205px%3B%0Abackground%2Dcolor%3A%20%23f6f6f6%3B%0Afont%2Dsize%3A%2013px%3B%0Aline%2Dheight%3A%201%2E3%3B%0A%7D%0A%23TOC%20%2Etoctitle%20%7B%0Afont%2Dweight%3A%20bold%3B%0Afont%2Dsize%3A%2015px%3B%0Amargin%2Dleft%3A%205px%3B%0A%7D%0A%23TOC%20ul%20%7B%0Apadding%2Dleft%3A%2040px%3B%0Amargin%2Dleft%3A%20%2D1%2E5em%3B%0Amargin%2Dtop%3A%205px%3B%0Amargin%2Dbottom%3A%205px%3B%0A%7D%0A%23TOC%20ul%20ul%20%7B%0Amargin%2Dleft%3A%20%2D2em%3B%0A%7D%0A%23TOC%20li%20%7B%0Aline%2Dheight%3A%2016px%3B%0A%7D%0Atable%20%7B%0Amargin%3A%201em%20auto%3B%0Aborder%2Dwidth%3A%201px%3B%0Aborder%2Dcolor%3A%20%23DDDDDD%3B%0Aborder%2Dstyle%3A%20outset%3B%0Aborder%2Dcollapse%3A%20collapse%3B%0A%7D%0Atable%20th%20%7B%0Aborder%2Dwidth%3A%202px%3B%0Apadding%3A%205px%3B%0Aborder%2Dstyle%3A%20inset%3B%0A%7D%0Atable%20td%20%7B%0Aborder%2Dwidth%3A%201px%3B%0Aborder%2Dstyle%3A%20inset%3B%0Aline%2Dheight%3A%2018px%3B%0Apadding%3A%205px%205px%3B%0A%7D%0Atable%2C%20table%20th%2C%20table%20td%20%7B%0Aborder%2Dleft%2Dstyle%3A%20none%3B%0Aborder%2Dright%2Dstyle%3A%20none%3B%0A%7D%0Atable%20thead%2C%20table%20tr%2Eeven%20%7B%0Abackground%2Dcolor%3A%20%23f7f7f7%3B%0A%7D%0Ap%20%7B%0Amargin%3A%200%2E5em%200%3B%0A%7D%0Ablockquote%20%7B%0Abackground%2Dcolor%3A%20%23f6f6f6%3B%0Apadding%3A%200%2E25em%200%2E75em%3B%0A%7D%0Ahr%20%7B%0Aborder%2Dstyle%3A%20solid%3B%0Aborder%3A%20none%3B%0Aborde
r%2Dtop%3A%201px%20solid%20%23777%3B%0Amargin%3A%2028px%200%3B%0A%7D%0Adl%20%7B%0Amargin%2Dleft%3A%200%3B%0A%7D%0Adl%20dd%20%7B%0Amargin%2Dbottom%3A%2013px%3B%0Amargin%2Dleft%3A%2013px%3B%0A%7D%0Adl%20dt%20%7B%0Afont%2Dweight%3A%20bold%3B%0A%7D%0Aul%20%7B%0Amargin%2Dtop%3A%200%3B%0A%7D%0Aul%20li%20%7B%0Alist%2Dstyle%3A%20circle%20outside%3B%0A%7D%0Aul%20ul%20%7B%0Amargin%2Dbottom%3A%200%3B%0A%7D%0Apre%2C%20code%20%7B%0Abackground%2Dcolor%3A%20%23f7f7f7%3B%0Aborder%2Dradius%3A%203px%3B%0Acolor%3A%20%23333%3B%0Awhite%2Dspace%3A%20pre%2Dwrap%3B%20%0A%7D%0Apre%20%7B%0Aborder%2Dradius%3A%203px%3B%0Amargin%3A%205px%200px%2010px%200px%3B%0Apadding%3A%2010px%3B%0A%7D%0Apre%3Anot%28%5Bclass%5D%29%20%7B%0Abackground%2Dcolor%3A%20%23f7f7f7%3B%0A%7D%0Acode%20%7B%0Afont%2Dfamily%3A%20Consolas%2C%20Monaco%2C%20%27Courier%20New%27%2C%20monospace%3B%0Afont%2Dsize%3A%2085%25%3B%0A%7D%0Ap%20%3E%20code%2C%20li%20%3E%20code%20%7B%0Apadding%3A%202px%200px%3B%0A%7D%0Adiv%2Efigure%20%7B%0Atext%2Dalign%3A%20center%3B%0A%7D%0Aimg%20%7B%0Abackground%2Dcolor%3A%20%23FFFFFF%3B%0Apadding%3A%202px%3B%0Aborder%3A%201px%20solid%20%23DDDDDD%3B%0Aborder%2Dradius%3A%203px%3B%0Aborder%3A%201px%20solid%20%23CCCCCC%3B%0Amargin%3A%200%205px%3B%0A%7D%0Ah1%20%7B%0Amargin%2Dtop%3A%200%3B%0Afont%2Dsize%3A%2035px%3B%0Aline%2Dheight%3A%2040px%3B%0A%7D%0Ah2%20%7B%0Aborder%2Dbottom%3A%204px%20solid%20%23f7f7f7%3B%0Apadding%2Dtop%3A%2010px%3B%0Apadding%2Dbottom%3A%202px%3B%0Afont%2Dsize%3A%20145%25%3B%0A%7D%0Ah3%20%7B%0Aborder%2Dbottom%3A%202px%20solid%20%23f7f7f7%3B%0Apadding%2Dtop%3A%2010px%3B%0Afont%2Dsize%3A%20120%25%3B%0A%7D%0Ah4%20%7B%0Aborder%2Dbottom%3A%201px%20solid%20%23f7f7f7%3B%0Amargin%2Dleft%3A%208px%3B%0Afont%2Dsize%3A%20105%25%3B%0A%7D%0Ah5%2C%20h6%20%7B%0Aborder%2Dbottom%3A%201px%20solid%20%23ccc%3B%0Afont%2Dsize%3A%20105%25%3B%0A%7D%0Aa%20%7B%0Acolor%3A%20%230033dd%3B%0Atext%2Ddecoration%3A%20none%3B%0A%7D%0Aa%3Ahover%20%7B%0Acolor%3A%20%236666ff%3B%20%7D%0Aa%3Avisited%20%7B%0Acolor%3A%20%238000
80%3B%20%7D%0Aa%3Avisited%3Ahover%20%7B%0Acolor%3A%20%23BB00BB%3B%20%7D%0Aa%5Bhref%5E%3D%22http%3A%22%5D%20%7B%0Atext%2Ddecoration%3A%20underline%3B%20%7D%0Aa%5Bhref%5E%3D%22https%3A%22%5D%20%7B%0Atext%2Ddecoration%3A%20underline%3B%20%7D%0A%0Acode%20%3E%20span%2Ekw%20%7B%20color%3A%20%23555%3B%20font%2Dweight%3A%20bold%3B%20%7D%20%0Acode%20%3E%20span%2Edt%20%7B%20color%3A%20%23902000%3B%20%7D%20%0Acode%20%3E%20span%2Edv%20%7B%20color%3A%20%2340a070%3B%20%7D%20%0Acode%20%3E%20span%2Ebn%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Efl%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Ech%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Est%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Eco%20%7B%20color%3A%20%23888888%3B%20font%2Dstyle%3A%20italic%3B%20%7D%20%0Acode%20%3E%20span%2Eot%20%7B%20color%3A%20%23007020%3B%20%7D%20%0Acode%20%3E%20span%2Eal%20%7B%20color%3A%20%23ff0000%3B%20font%2Dweight%3A%20bold%3B%20%7D%20%0Acode%20%3E%20span%2Efu%20%7B%20color%3A%20%23900%3B%20font%2Dweight%3A%20bold%3B%20%7D%20%20code%20%3E%20span%2Eer%20%7B%20color%3A%20%23a61717%3B%20background%2Dcolor%3A%20%23e3d2d2%3B%20%7D%20%0A" rel="stylesheet" type="text/css" />

</head>

<body>




<h1 class="title toc-ignore">scImpute Updates</h1>
<h4 class="author"><em>Wei Vivian Li</em></h4>
<h4 class="date"><em>2018-08-15</em></h4>



<div id="updates" class="section level2">
<h2>Updates</h2>
<blockquote>
<p>2018/08/15:</p>
</blockquote>
<ul>
<li>Version 0.0.9 is released!</li>
<li>More robust implementation of dimension reduction.</li>
<li>Faster calculation of cell similarity.</li>
</ul>
<blockquote>
<p>2018/06/27:</p>
</blockquote>
<ul>
<li>Version 0.0.8 is released!</li>
<li>Faster implementation of dimension reduction.</li>
</ul>
<blockquote>
<p>2018/06/08:</p>
</blockquote>
<ul>
<li>Version 0.0.7 is released!</li>
<li>New option for application on TPM values.</li>
</ul>
<blockquote>
<p>2018/03/16:</p>
</blockquote>
<ul>
<li>Version 0.0.6 is released!</li>
<li>The scImpute method is published at <a href="https://www.nature.com/articles/s41467-018-03405-7"><em>Nature Communications</em></a>.</li>
<li>scImpute now supports input and output in the format of R objects (.rds).</li>
</ul>
<blockquote>
<p>2018/01/12:</p>
</blockquote>
<ul>
<li>Version 0.0.5 is released!</li>
<li>It is now possible to apply scImpute on just one cell population by setting <code>Kcluster = 1</code>.</li>
</ul>
<blockquote>
<p>2017/10/27:</p>
</blockquote>
<ul>
<li>Version 0.0.4 is released!</li>
<li>scImpute now supports multi-core parallelism.</li>
</ul>
<blockquote>
<p>2017/10/22:</p>
</blockquote>
<ul>
<li>Version 0.0.3 is released!</li>
<li>Estimation of dropout probabilities is more accurate.</li>
<li>Imputation step is more robust.</li>
<li><code>scimpute()</code> incorporates a new parameter <code>Kcluster</code> to specify the number of cell subpopulations.</li>
<li><code>scImpute</code> is now able to detect outlier cells.</li>
</ul>
<blockquote>
<p>2017/07/01:</p>
</blockquote>
<ul>
<li>Version 0.0.2 is released!</li>
<li>This version speeds up the first step in <code>scImpute</code> and the program now completes in a few seconds when applied to a dataset with 10,000 genes and 100 cells (using single core).</li>
</ul>
</div>



<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
(function () {
// Immediately-invoked function: keeps the `script` variable out of the global scope.
// Build the <script> element at runtime rather than hard-coding the tag in the page.
var script = document.createElement("script");
script.type = "text/javascript";
// MathJax is requested from the RStudio-hosted URL with the TeX-AMS-MML_HTMLorMML configuration.
script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
// Attach the new element to the first <head> element of the document.
document.getElementsByTagName("head")[0].appendChild(script);
})();
</script>

</body>
</html>

0 comments on commit b1623c5

Please sign in to comment.