v0.0.9

faster calculation of cell similarity; change variation threshold.
Vivianstats · Aug 15, 2018 · b1623c5 · b1623c5
1 parent cc491c3
commit b1623c5
Show file tree

Hide file tree

Showing 4 changed files with 191 additions and 77 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: scImpute
 Type: Package
 Title: Accurate and robust imputation of single-cell RNA sequencing data
-Version: 0.0.8
-Date: 2018-06-27
+Version: 0.0.9
+Date: 2018-08-15
 Author: Wei Vivian Li, Jingyi Jessica Li
 Maintainer: Wei Vivian Li <[email protected]>
 Description: scRNA-seq analysis is complicated by the excess of zero or near zero counts in the data, which are the so-called dropouts due to low amounts of mRNA within each individual cell. scImpute is developed to simultaneously determine which expression values are affected by dropout events in scRNA-seq data and perform imputation only on dropout entries.

diff --git a/R/imputation_model.R b/R/imputation_model.R
@@ -32,26 +32,38 @@ find_neighbors = function(count_hv, labeled, J, Kcluster = NULL,
       cell_inds = which(clust == ll)
       count_hv_sub = count_hv[, cell_inds, drop = FALSE]
       if(J < 1000){
+        var_thre = 0.4
         pca = prcomp(t(count_hv_sub))
+        eigs = (pca$sdev)^2
+        var_cum = cumsum(eigs)/sum(eigs)
+        if(max(var_cum) <= var_thre){
+          npc = length(var_cum)
+        }else{
+          npc = which.max(var_cum > var_thre)
+          if (labeled == FALSE){ npc = max(npc, Kcluster) }
+        }
       }else{
-        pca = rpca(t(count_hv_sub), k = 100, center = TRUE, scale = FALSE) 
-      }
-      eigs = (pca$sdev)^2
-      var_cum = cumsum(eigs)/sum(eigs)
-      if(max(var_cum) <= 0.8){
-        npc = length(var_cum)
-      }else{
-        npc = which.max(var_cum > 0.8)
+        var_thre = 0.6
+        pca = rpca(t(count_hv_sub), k = 1000, center = TRUE, scale = FALSE) 
+        eigs = (pca$sdev)^2
+        var_cum = cumsum(eigs)/sum(eigs)
+        if(max(var_cum) <= var_thre){
+          npc = length(var_cum)
+        }else{
+          npc = which.max(var_cum > var_thre)
+          if (labeled == FALSE){ npc = max(npc, Kcluster) }
+        }
       }
+
       if (npc < 3){ npc = 3 }
       mat_pcs = t(pca$x[, 1:npc]) 
 
       dist_cells_list = mclapply(1:length(cell_inds), function(id1){
-        sapply(1:length(cell_inds), function(id2){
-          if(id1 <= id2) return(0)
+        d = sapply(1:id1, function(id2){
           sse = sum((mat_pcs[, id1] - mat_pcs[, id2])^2)
           sqrt(sse)
         })
+        return(c(d, rep(0, length(cell_inds)-id1)))
       }, mc.cores = ncores)
       dist_cells = matrix(0, nrow = length(cell_inds), ncol = length(cell_inds))
       for(cellid in 1:length(cell_inds)){dist_cells[cellid, ] = dist_cells_list[[cellid]]}
@@ -65,29 +77,39 @@ find_neighbors = function(count_hv, labeled, J, Kcluster = NULL,
     ## dimeansion reduction
     print("dimension reduction ...")
     if(J < 1000){
+      var_thre = 0.4
       pca = prcomp(t(count_hv))
+      eigs = (pca$sdev)^2
+      var_cum = cumsum(eigs)/sum(eigs)
+      if(max(var_cum) <= var_thre){
+        npc = length(var_cum)
+      }else{
+        npc = which.max(var_cum > var_thre)
+        if (labeled == FALSE){ npc = max(npc, Kcluster) }
+      }
     }else{
-      pca = rpca(t(count_hv), k = 100, center = TRUE, scale = FALSE) 
-    }
-    eigs = (pca$sdev)^2
-    var_cum = cumsum(eigs)/sum(eigs)
-    if(max(var_cum) <= 0.8){
-      npc = length(var_cum)
-    }else{
-      npc = which.max(var_cum > 0.8)
-      if (labeled == FALSE){ npc = max(npc, Kcluster) }
+      var_thre = 0.6
+      pca = rpca(t(count_hv), k = 1000, center = TRUE, scale = FALSE) 
+      eigs = (pca$sdev)^2
+      var_cum = cumsum(eigs)/sum(eigs)
+      if(max(var_cum) <= var_thre){
+        npc = length(var_cum)
+      }else{
+        npc = which.max(var_cum > var_thre)
+        if (labeled == FALSE){ npc = max(npc, Kcluster) }
+      }
     }
     if (npc < 3){ npc = 3 }
     mat_pcs = t(pca$x[, 1:npc]) # columns are cells
 
     ## detect outliers
     print("calculating cell distances ...")
     dist_cells_list = mclapply(1:J, function(id1){
-      sapply(1:J, function(id2){
-        if(id1 <= id2) return(0)
+      d = sapply(1:id1, function(id2){
         sse = sum((mat_pcs[, id1] - mat_pcs[, id2])^2)
         sqrt(sse)
       })
+      return(c(d, rep(0, J-id1)))
     }, mc.cores = ncores)
     dist_cells = matrix(0, nrow = J, ncol = J)
     for(cellid in 1:J){dist_cells[cellid, ] = dist_cells_list[[cellid]]}
@@ -174,27 +196,38 @@ imputation_model8 = function(count, labeled, point, drop_thre = 0.5, Kcluster =
   if(Kcluster == 1){
     clust = rep(1, J)
     if(J < 1000){
+      var_thre = 0.4
       pca = prcomp(t(count_hv))
+      eigs = (pca$sdev)^2
+      var_cum = cumsum(eigs)/sum(eigs)
+      if(max(var_cum) <= var_thre){
+        npc = length(var_cum)
+      }else{
+        npc = which.max(var_cum > var_thre)
+        if (labeled == FALSE){ npc = max(npc, Kcluster) }
+      }
     }else{
-      pca = rpca(t(count_hv), k = 100, center = TRUE, scale = FALSE) 
-    }
-    eigs = (pca$sdev)^2
-    var_cum = cumsum(eigs)/sum(eigs)
-    if(max(var_cum) <= 0.8){
-      npc = length(var_cum)
-    }else{
-      npc = which.max(var_cum > 0.8)
-      if (labeled == FALSE){ npc = max(npc, Kcluster) }
+      var_thre = 0.6
+      pca = rpca(t(count_hv), k = 1000, center = TRUE, scale = FALSE) 
+      eigs = (pca$sdev)^2
+      var_cum = cumsum(eigs)/sum(eigs)
+      if(max(var_cum) <= var_thre){
+        npc = length(var_cum)
+      }else{
+        npc = which.max(var_cum > var_thre)
+        if (labeled == FALSE){ npc = max(npc, Kcluster) }
+      }
     }
+
     if (npc < 3){ npc = 3 }
     mat_pcs = t(pca$x[, 1:npc]) # columns are cells
 
     dist_cells_list = mclapply(1:J, function(id1){
-      sapply(1:J, function(id2){
-        if(id1 <= id2) return(0)
+      d = sapply(1:id1, function(id2){
         sse = sum((mat_pcs[, id1] - mat_pcs[, id2])^2)
         sqrt(sse)
       })
+      return(c(d, rep(0, J-id1)))
     }, mc.cores = ncores)
     dist_cells = matrix(0, nrow = J, ncol = J)
     for(cellid in 1:J){dist_cells[cellid, ] = dist_cells_list[[cellid]]}
@@ -291,7 +324,7 @@ imputation_wlabel_model8 = function(count, labeled, cell_labels = NULL, point, d
   print("searching candidate neighbors ... ")
   neighbors_res = find_neighbors(count_hv = count_hv, labeled = TRUE, J = J,  
                                  ncores = ncores, cell_labels = cell_labels)
-  dist_cells_list = neighbors_res$dist_list
+  dist_cells = neighbors_res$dist_cells
   clust = neighbors_res$clust
 
   # mixture model
@@ -348,7 +381,7 @@ imputation_wlabel_model8 = function(count, labeled, cell_labels = NULL, point, d
       geneid_drop = setA[[cellid]]
       geneid_obs = setB[[cellid]]
       y = try(impute_nnls(Ic, cellid = cellid, subcount, droprate, geneid_drop, 
-                          geneid_obs, nbs, distc = dist_cells_list[[cc]]),
+                          geneid_obs, nbs, distc = dist_cells[cells, cells]),
               silent = TRUE)
       if (class(y) == "try-error") {
         # print(y)

diff --git a/README.md b/README.md
@@ -1,16 +1,17 @@
 scImpute: accurate and robust imputation of scRNA-seq data
 ================
 Wei Vivian Li, Jingyi Jessica Li
-2018-06-28
+2018-08-15
 
 <!-- README.md is generated from README.Rmd. Please edit that file -->
 Latest News
 -----------
 
-> 2018/06/27:
+> 2018/08/15:
 
--   Version 0.0.8 is released!
--   Faster implementation of dimension reduction.
+-   Version 0.0.9 is released!
+-   More robust implementation of dimension reduction.
+-   Faster calculation of cell similarity.
 
 Introduction
 ------------
@@ -59,40 +60,3 @@ scimpute(# full path to raw count matrix
 This function returns the column indices of outlier cells, and creates a new file `scimpute_count.csv` in `out_dir` to store the imputed count matrix. Please note that we recommend applying scImpute on the whole-genome count matrix. A filtering step on genes is acceptable but most genes should be present to ensure robust identification of dropouts.
 
 For detailed usage, please refer to the package [manual](https://github.com/Vivianstats/scImpute/blob/master/inst/docs/) or [vignette](https://github.com/Vivianstats/scImpute/blob/master/vignettes/scImpute-vignette.Rmd).
-
-Updates
--------
-
-> 2018/06/08:
-
--   Version 0.0.7 is released!
--   New option for application on TPM values.
-
-> 2018/03/16:
-
--   Version 0.0.6 is released!
--   The scImpute method is published at [*Nature Communications*](https://www.nature.com/articles/s41467-018-03405-7).
--   scImpute now supports input and output in the format of R objects (.rds).
-
-> 2018/01/12:
-
--   Version 0.0.5 is released!
--   It is now possible to apply scImpute on just one cell population by setting `Kcluster = 1`.
-
-> 2017/10/27:
-
--   Version 0.0.4 is released!
--   scImpute now supports multi-code parallelism.
-
-> 2017/10/22:
-
--   Version 0.0.3 is released!
--   Estimation of dropout probabilities is more accurate.
--   Imputation step is more robust.
--   `scimpute()` incorporates a new parameter `Kcluster` to specify the number of cell subpopulations.
--   `scImpute` is now able to detect outlier cells.
-
-> 2017/07/01:
-
--   Version 0.0.2 is released!
--   This version speeds up the first step in `scImpute` and program now completes in a few seconds when applied to a dataset with 10,000 genes and 100 cells (using single core).
diff --git a/inst/docs/scImpute-news.html b/inst/docs/scImpute-news.html
@@ -0,0 +1,117 @@
+<!DOCTYPE html>
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+
+<head>
+
+<meta charset="utf-8" />
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="generator" content="pandoc" />
+
+<meta name="viewport" content="width=device-width, initial-scale=1">
+
+<meta name="author" content="Wei Vivian Li" />
+
+<meta name="date" content="2018-08-15" />
+
+<title>scImpute Updates</title>
+
+
+
+
+
+
+<link href="data:text/css;charset=utf-8,body%20%7B%0Abackground%2Dcolor%3A%20%23fff%3B%0Amargin%3A%201em%20auto%3B%0Amax%2Dwidth%3A%20700px%3B%0Aoverflow%3A%20visible%3B%0Apadding%2Dleft%3A%202em%3B%0Apadding%2Dright%3A%202em%3B%0Afont%2Dfamily%3A%20%22Open%20Sans%22%2C%20%22Helvetica%20Neue%22%2C%20Helvetica%2C%20Arial%2C%20sans%2Dserif%3B%0Afont%2Dsize%3A%2014px%3B%0Aline%2Dheight%3A%201%2E35%3B%0A%7D%0A%23header%20%7B%0Atext%2Dalign%3A%20center%3B%0A%7D%0A%23TOC%20%7B%0Aclear%3A%20both%3B%0Amargin%3A%200%200%2010px%2010px%3B%0Apadding%3A%204px%3B%0Awidth%3A%20400px%3B%0Aborder%3A%201px%20solid%20%23CCCCCC%3B%0Aborder%2Dradius%3A%205px%3B%0Abackground%2Dcolor%3A%20%23f6f6f6%3B%0Afont%2Dsize%3A%2013px%3B%0Aline%2Dheight%3A%201%2E3%3B%0A%7D%0A%23TOC%20%2Etoctitle%20%7B%0Afont%2Dweight%3A%20bold%3B%0Afont%2Dsize%3A%2015px%3B%0Amargin%2Dleft%3A%205px%3B%0A%7D%0A%23TOC%20ul%20%7B%0Apadding%2Dleft%3A%2040px%3B%0Amargin%2Dleft%3A%20%2D1%2E5em%3B%0Amargin%2Dtop%3A%205px%3B%0Amargin%2Dbottom%3A%205px%3B%0A%7D%0A%23TOC%20ul%20ul%20%7B%0Amargin%2Dleft%3A%20%2D2em%3B%0A%7D%0A%23TOC%20li%20%7B%0Aline%2Dheight%3A%2016px%3B%0A%7D%0Atable%20%7B%0Amargin%3A%201em%20auto%3B%0Aborder%2Dwidth%3A%201px%3B%0Aborder%2Dcolor%3A%20%23DDDDDD%3B%0Aborder%2Dstyle%3A%20outset%3B%0Aborder%2Dcollapse%3A%20collapse%3B%0A%7D%0Atable%20th%20%7B%0Aborder%2Dwidth%3A%202px%3B%0Apadding%3A%205px%3B%0Aborder%2Dstyle%3A%20inset%3B%0A%7D%0Atable%20td%20%7B%0Aborder%2Dwidth%3A%201px%3B%0Aborder%2Dstyle%3A%20inset%3B%0Aline%2Dheight%3A%2018px%3B%0Apadding%3A%205px%205px%3B%0A%7D%0Atable%2C%20table%20th%2C%20table%20td%20%7B%0Aborder%2Dleft%2Dstyle%3A%20none%3B%0Aborder%2Dright%2Dstyle%3A%20none%3B%0A%7D%0Atable%20thead%2C%20table%20tr%2Eeven%20%7B%0Abackground%2Dcolor%3A%20%23f7f7f7%3B%0A%7D%0Ap%20%7B%0Amargin%3A%200%2E5em%200%3B%0A%7D%0Ablockquote%20%7B%0Abackground%2Dcolor%3A%20%23f6f6f6%3B%0Apadding%3A%200%2E25em%200%2E75em%3B%0A%7D%0Ahr%20%7B%0Aborder%2Dstyle%3A%20solid%3B%0Aborder%3A%20none%3B%0Abord
er%2Dtop%3A%201px%20solid%20%23777%3B%0Amargin%3A%2028px%200%3B%0A%7D%0Adl%20%7B%0Amargin%2Dleft%3A%200%3B%0A%7D%0Adl%20dd%20%7B%0Amargin%2Dbottom%3A%2013px%3B%0Amargin%2Dleft%3A%2013px%3B%0A%7D%0Adl%20dt%20%7B%0Afont%2Dweight%3A%20bold%3B%0A%7D%0Aul%20%7B%0Amargin%2Dtop%3A%200%3B%0A%7D%0Aul%20li%20%7B%0Alist%2Dstyle%3A%20circle%20outside%3B%0A%7D%0Aul%20ul%20%7B%0Amargin%2Dbottom%3A%200%3B%0A%7D%0Apre%2C%20code%20%7B%0Abackground%2Dcolor%3A%20%23f7f7f7%3B%0Aborder%2Dradius%3A%203px%3B%0Acolor%3A%20%23333%3B%0Awhite%2Dspace%3A%20pre%2Dwrap%3B%20%0A%7D%0Apre%20%7B%0Aborder%2Dradius%3A%203px%3B%0Amargin%3A%205px%200px%2010px%200px%3B%0Apadding%3A%2010px%3B%0A%7D%0Apre%3Anot%28%5Bclass%5D%29%20%7B%0Abackground%2Dcolor%3A%20%23f7f7f7%3B%0A%7D%0Acode%20%7B%0Afont%2Dfamily%3A%20Consolas%2C%20Monaco%2C%20%27Courier%20New%27%2C%20monospace%3B%0Afont%2Dsize%3A%2085%25%3B%0A%7D%0Ap%20%3E%20code%2C%20li%20%3E%20code%20%7B%0Apadding%3A%202px%200px%3B%0A%7D%0Adiv%2Efigure%20%7B%0Atext%2Dalign%3A%20center%3B%0A%7D%0Aimg%20%7B%0Abackground%2Dcolor%3A%20%23FFFFFF%3B%0Apadding%3A%202px%3B%0Aborder%3A%201px%20solid%20%23DDDDDD%3B%0Aborder%2Dradius%3A%203px%3B%0Aborder%3A%201px%20solid%20%23CCCCCC%3B%0Amargin%3A%200%205px%3B%0A%7D%0Ah1%20%7B%0Amargin%2Dtop%3A%200%3B%0Afont%2Dsize%3A%2035px%3B%0Aline%2Dheight%3A%2040px%3B%0A%7D%0Ah2%20%7B%0Aborder%2Dbottom%3A%204px%20solid%20%23f7f7f7%3B%0Apadding%2Dtop%3A%2010px%3B%0Apadding%2Dbottom%3A%202px%3B%0Afont%2Dsize%3A%20145%25%3B%0A%7D%0Ah3%20%7B%0Aborder%2Dbottom%3A%202px%20solid%20%23f7f7f7%3B%0Apadding%2Dtop%3A%2010px%3B%0Afont%2Dsize%3A%20120%25%3B%0A%7D%0Ah4%20%7B%0Aborder%2Dbottom%3A%201px%20solid%20%23f7f7f7%3B%0Amargin%2Dleft%3A%208px%3B%0Afont%2Dsize%3A%20105%25%3B%0A%7D%0Ah5%2C%20h6%20%7B%0Aborder%2Dbottom%3A%201px%20solid%20%23ccc%3B%0Afont%2Dsize%3A%20105%25%3B%0A%7D%0Aa%20%7B%0Acolor%3A%20%230033dd%3B%0Atext%2Ddecoration%3A%20none%3B%0A%7D%0Aa%3Ahover%20%7B%0Acolor%3A%20%236666ff%3B%20%7D%0Aa%3Avisited%20%7B%0Acolor%3A%20%23800
080%3B%20%7D%0Aa%3Avisited%3Ahover%20%7B%0Acolor%3A%20%23BB00BB%3B%20%7D%0Aa%5Bhref%5E%3D%22http%3A%22%5D%20%7B%0Atext%2Ddecoration%3A%20underline%3B%20%7D%0Aa%5Bhref%5E%3D%22https%3A%22%5D%20%7B%0Atext%2Ddecoration%3A%20underline%3B%20%7D%0A%0Acode%20%3E%20span%2Ekw%20%7B%20color%3A%20%23555%3B%20font%2Dweight%3A%20bold%3B%20%7D%20%0Acode%20%3E%20span%2Edt%20%7B%20color%3A%20%23902000%3B%20%7D%20%0Acode%20%3E%20span%2Edv%20%7B%20color%3A%20%2340a070%3B%20%7D%20%0Acode%20%3E%20span%2Ebn%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Efl%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Ech%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Est%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Eco%20%7B%20color%3A%20%23888888%3B%20font%2Dstyle%3A%20italic%3B%20%7D%20%0Acode%20%3E%20span%2Eot%20%7B%20color%3A%20%23007020%3B%20%7D%20%0Acode%20%3E%20span%2Eal%20%7B%20color%3A%20%23ff0000%3B%20font%2Dweight%3A%20bold%3B%20%7D%20%0Acode%20%3E%20span%2Efu%20%7B%20color%3A%20%23900%3B%20font%2Dweight%3A%20bold%3B%20%7D%20%20code%20%3E%20span%2Eer%20%7B%20color%3A%20%23a61717%3B%20background%2Dcolor%3A%20%23e3d2d2%3B%20%7D%20%0A" rel="stylesheet" type="text/css" />
+
+</head>
+
+<body>
+
+
+
+
+<h1 class="title toc-ignore">scImpute Updates</h1>
+<h4 class="author"><em>Wei Vivian Li</em></h4>
+<h4 class="date"><em>2018-08-15</em></h4>
+
+
+
+<div id="updates" class="section level2">
+<h2>Updates</h2>
+<blockquote>
+<p>2018/08/15:</p>
+</blockquote>
+<ul>
+<li>Version 0.0.9 is released!</li>
+<li>More robust implementation of dimension reduction.</li>
+<li>Faster calculation of cell similarity.</li>
+</ul>
+<blockquote>
+<p>2018/06/27:</p>
+</blockquote>
+<ul>
+<li>Version 0.0.8 is released!</li>
+<li>Faster implementation of dimension reduction.</li>
+</ul>
+<blockquote>
+<p>2018/06/08:</p>
+</blockquote>
+<ul>
+<li>Version 0.0.7 is released!</li>
+<li>New option for application on TPM values.</li>
+</ul>
+<blockquote>
+<p>2018/03/16:</p>
+</blockquote>
+<ul>
+<li>Version 0.0.6 is released!</li>
+<li>The scImpute method is published at <a href="https://www.nature.com/articles/s41467-018-03405-7"><em>Nature Communications</em></a>.</li>
+<li>scImpute now supports input and output in the format of R objects (.rds).</li>
+</ul>
+<blockquote>
+<p>2018/01/12:</p>
+</blockquote>
+<ul>
+<li>Version 0.0.5 is released!</li>
+<li>It is now possible to apply scImpute on just one cell population by setting <code>Kcluster = 1</code>.</li>
+</ul>
+<blockquote>
+<p>2017/10/27:</p>
+</blockquote>
+<ul>
+<li>Version 0.0.4 is released!</li>
+<li>scImpute now supports multi-core parallelism.</li>
+</ul>
+<blockquote>
+<p>2017/10/22:</p>
+</blockquote>
+<ul>
+<li>Version 0.0.3 is released!</li>
+<li>Estimation of dropout probabilities is more accurate.</li>
+<li>Imputation step is more robust.</li>
+<li><code>scimpute()</code> incorporates a new parameter <code>Kcluster</code> to specify the number of cell subpopulations.</li>
+<li><code>scImpute</code> is now able to detect outlier cells.</li>
+</ul>
+<blockquote>
+<p>2017/07/01:</p>
+</blockquote>
+<ul>
+<li>Version 0.0.2 is released!</li>
+<li>This version speeds up the first step in <code>scImpute</code>, and the program now completes in a few seconds when applied to a dataset with 10,000 genes and 100 cells (using a single core).</li>
+</ul>
+</div>
+
+
+
+<!-- dynamically load mathjax for compatibility with self-contained -->
+<script>
+  (function () {
+    var script = document.createElement("script");
+    script.type = "text/javascript";
+    script.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
+    document.getElementsByTagName("head")[0].appendChild(script);
+  })();
+</script>
+
+</body>
+</html>