0.99.2 version

peterawe · Jun 11, 2020 · 40dfe24 · 40dfe24
1 parent 5e7c937
commit 40dfe24
Show file tree

Hide file tree

Showing 39 changed files with 377 additions and 192 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: CMScaller
 Title: CMScaller: an R package for consensus molecular subtyping of colorectal cancer pre-clinical models
 Description: Colorectal cancers (CRCs) can be divided into four gene expression-based and biologically distinct consensus molecular subtypes (CMS). This classification is clinically relevant and provides both prognostic stratification of the patients and a potential basis for stratified treatment. CMScaller provides CMS classification of pre-clinical models such as cell lines, organoids and patient derived xenografts based on a microarray or RNA-sequencing gene expression data.
-Version: 0.99.1
+Version: 0.99.2
 Date: 2017-10-23
 Authors@R: c(person("Peter W.", "Eide", 
                     email = "[email protected]",
@@ -15,22 +15,26 @@ Imports:
     Biobase,
     graphics,
     stats,
-    utils
+    utils,
 Suggests:
     BiocStyle,
+    commonmark,
     edgeR,
     knitr,
     limma,
     parallel,
+    plotrix,
     snow,
-    testthat
+    testthat,
+    xml2
 biocViews:
     Software,
     GeneExpression,
     StatisticalMethod,
     Classification
 LazyData: TRUE
+Encoding: UTF-8
 ByteCompile: TRUE
 License: GPL-3 + file LICENSE
-RoxygenNote: 6.0.1
+RoxygenNote: 7.1.0
 VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
@@ -8,6 +8,7 @@ export(ematAdjust)
 export(fromTo)
 export(ntp)
 export(ntpMakeTemplates)
+export(plotPC)
 export(replaceGeneId)
 export(subBoxplot)
 export(subCamera)

diff --git a/NEWS.md b/NEWS.md
@@ -1,8 +1,8 @@
-# 0.99.2 
+# 0.99.2
 
 ## Changes
 
-* genome annotations updated to [GENCODE](https://www.gencodegenes.org/) v26/genome build GRCh38.p10
+* genome annotations updated to [GENCODE](https://www.gencodegenes.org/) v26/genome build GRCh38.p10 (now reported as startup message)
 * geneSets updated: [MSigDB](http://software.broadinstitute.org/gsea/index.jsp) to v7.0 and [reactome](https://reactome.org/) to download date 2019-Aug-22
 * default heatmap colors changed to be less intrusive
 * default qualitative color palette changed to [Okabe & Ito's Color Universal Design](http://jfly.iam.u-tokyo.ac.jp/color/)
@@ -13,13 +13,13 @@
 
 ## Bug fixes
 
-* `subCamera` p-value legend lacked the color representing the highest value [SH Moosavi]
+* `subCamera` p-value legend lacked the color representing the highest value [reported by SH Moosavi]
 
-## notes
+## Notes
 
-* [R 3.6.0 implemented a bug fix to `sample` which may break p-value reproducibility with older versions](https://cran.r-project.org/doc/manuals/r-release/NEWS.html)
+* [R 3.6.0 fixed a bug in `sample()`. Fix may break p-value reproducibility for older versions](https://cran.r-project.org/doc/manuals/r-release/NEWS.html)
 
-# 0.99.1 
+# 0.99.1 (2019-August)
 
 *Scientific Reports* publication
 

diff --git a/R/data.R b/R/data.R
@@ -4,7 +4,7 @@
 #' (\url{http://gdac.broadinstitute.org/}). Samples were annotated with
 #' Consensus Molecular Subtypes (Guinney 2015), Sadanandam subtypes
 #' (Isella 2015) and ABSOLUTE estimated tumor purity (Aran 2015). Features were
-#' annotated using \code{\link[CMScaller]{fromTo}}. Rownames are Ensembl ids.
+#' annotated using \code{\link[CMScaller]{fromTo}}. Rownames are Entrez ids.
 #' Included are 92 HiSeq test set samples with genes with maximum count
 #' exceeding 25.
 #' @references Aran D, Sirota M, Butte AJ. Systematic pan-cancer analysis of tumour purity. Nat Commun. 2015;6:8971.
@@ -20,7 +20,7 @@
 #' @details Colorectal cancer Consensus Molecular Subtypes (CMS) prediction
 #' templates for \code{\link[CMScaller]{ntp}}. Marker genes were
 #' identified using TCGA RNA-sequencing data. \code{templates$probe} refers to
-#' Ensembl ids.
+#' Entrez ids.
 #' @references Guinney J, Dienstmann R, Wang X, de Reynies A, Schlicker A, Soneson C, et al. The consensus molecular subtypes of colorectal cancer. Nat Med [Internet]. 2015 [cited 2015 Nov 5];advance online publication. Available from: \url{http://www.nature.com/nm/journal/vaop/ncurrent/full/nm.3967.html}
 #' @references Hoshida Y. Nearest Template Prediction: A Single-Sample-Based Flexible Class Prediction with Confidence Assessment. PLoS ONE. 2010;5:e15543.
 #' @examples
@@ -32,7 +32,7 @@
 #' @details Colorectal cancer Consensus Molecular Subtypes (CMS) prediction
 #' templates for \code{\link[CMScaller]{ntp}}. Marker genes were
 #' identified using TCGA RNA-sequencing data. \code{templates$probe} refers to
-#' Ensembl ids. Compared to \code{\link{templates.CMS}} this was prepared using
+#' Entrez ids. Compared to \code{\link{templates.CMS}} this was prepared using
 #' additional filters steps to further clean genes also expressed by
 #' non-carcinoma cell types. Predictions are usually highly concordant with
 #' \code{\link{templates.CMS}}. Under development.
@@ -56,7 +56,7 @@
 #' micro-satellite instability templates
 #' @details micro-satellite instability (MSI) prediction templates for
 #' \code{\link[CMScaller]{ntp}}. Marker genes were identified using TCGA
-#' RNA-sequencing data. \code{templates$probe} refers to Ensembl ids.
+#' RNA-sequencing data. \code{templates$probe} refers to Entrez ids.
 #' @examples
 #' head(templates.MSI)
 #' table(templates.MSI$class)
@@ -86,23 +86,16 @@
 "anno.orgHs"
 
 #' gene sets for exploratory gene set analysis
-#' @details Gene sets from Reactome (Oct-2017) with Ensembl gene identifiers.
+#' @details Gene sets from Reactome (Aug-2019) with Entrez gene identifiers.
 #' @references Croft D, Mundo AF, Haw R, Milacic M, Weiser J, Wu G, et al. The Reactome pathway knowledgebase. Nucl Acids Res. 2014;42:D472-7.
 #' @examples
 #' head(names(geneSets.reactome))
 "geneSets.reactome"
 
-#' gene sets for exploratory gene set analysis
-#' @details Gene sets from MSigDB (v6.2) with ensembl gene identifiers.
-#' @examples
-#' head(names(geneSets.C2))
-"geneSets.C2"
-
-
 #' gene sets relevant to Consensus Molecular Subtypes
-#' @description Geneset is a named list of Ensembl idds. Watanabe CRC MSS/MSI,
+#' @description Geneset is a named list of Entrez idds. Watanabe CRC MSS/MSI,
 #' Liu CDX2, Lucas HNF4A, and Servitja HNF1A were retrieved from MutSigDB C2
-#' (v6.2). Gastro-Intestinal enriched genes are from the \href{http://www.proteinatlas.org/humanproteome/gastrointestinal+tract}{Human Protein Atlas}
+#' (v7.0). Gastro-Intestinal enriched genes are from the \href{http://www.proteinatlas.org/humanproteome/gastrointestinal+tract}{Human Protein Atlas}
 #' Extracellular Matrix mCRC is from Naba Additional Data 2.
 #' Crypt signatures are based on Merlos-Suarez Supplementary Table 5.
 #' MYB signature is based on Thorner Supplementary Table 2.
@@ -112,7 +105,7 @@
 #' CTNNB1/Beta-catenin signature is based on Watanabe Supplementary Table 1.
 #' WNT signatures are based on Vermeulen Supplementary Table S1.
 #' Retionic acid signatures are based on Duffy Supplementary Table 2.
-#' Remaining are either from MutSigDB Hallmarks (v6.2) or Reactome (Dec-2016).
+#' Remaining are either from MutSigDB Hallmarks (v7.0) or Reactome (Aug-2019).
 #' @references Croft D, Mundo AF, Haw R, Milacic M, Weiser J, Wu G, et al. The Reactome pathway knowledgebase. Nucl Acids Res. 2014;42:D472-7.
 #' @references Duffy DJ, Krstic A, Halasz M, Schwarzl T, Konietzny A, Iljin K, et al. Retinoic acid and TGF-beta signalling cooperate to overcome MYCN-induced retinoid resistance. Genome Medicine. 2017;9:15.
 #' @references Fessler E, Drost J, Hooff SR van, Linnekamp JF, Wang X, Jansen M, et al. TGF-beta signaling directs serrated adenomas to the mesenchymal colorectal cancer subtype. EMBO Molecular Medicine. 2016;8:745-60.
@@ -131,23 +124,23 @@
 "geneSets.CRC"
 
 #' gene sets relevant to Consensus Molecular Subtypes
-#' @description Geneset is a named list of Ensembl ids and is a subset of \code{\link{geneSets.CRC}}.
+#' @description Geneset is a named list of Entrez ids and is a subset of \code{\link{geneSets.CRC}}.
 #' \itemize{
-#' \item{MSI (Watanabe) from \href{http://software.broadinstitute.org/gsea/msigdb}{MutSigDB} C2 (v6.2)}
-#' \item{DNA repair from MutSigDB Hallmark (v6.2)}
-#' \item{HNF4A (Lucas) from MutSigDB C2 (v6.2)}
-#' \item{MSS (Watanabe) from MutSigDB C2 (v6.2)}
-#' \item{MYC from MutSigDB Hallmark (v6.2)}
+#' \item{MSI (Watanabe) from \href{http://software.broadinstitute.org/gsea/msigdb}{MutSigDB} C2 (v7.0)}
+#' \item{DNA repair from MutSigDB Hallmark (v7.0)}
+#' \item{HNF4A (Lucas) from MutSigDB C2 (v7.0)}
+#' \item{MSS (Watanabe) from MutSigDB C2 (v7.0)}
+#' \item{MYC from MutSigDB Hallmark (v7.0)}
 #' \item{WNT signature based on Vermeulen Supplementary Table S1}
-#' \item{cell cycle (E2F targets) from MutSigDB Hallmark (v6.2)}
+#' \item{cell cycle (E2F targets) from MutSigDB Hallmark (v7.0)}
 #' \item{differentiation (gastro-intestinal markers) from
 #' \href{http://www.proteinatlas.org/humanproteome/gastrointestinal+tract}{Human Protein Atlas}.}
-#' \item{glycolysis from MutSigDB Hallmark (v6.2)}
-#' \item{fatty acids from \href{http://www.reactome.org/}{reactome} (accessed 20180921)}
-#' \item{CDX2 (Liu) from MutSigDB C2 (v6.2)}
+#' \item{glycolysis from MutSigDB Hallmark (v7.0)}
+#' \item{fatty acids from \href{http://www.reactome.org/}{reactome} (accessed 20190822)}
+#' \item{CDX2 (Liu) from MutSigDB C2 (v7.0)}
 #' \item{LGR5 stem cells Merlos-Suarez Supplementary Table 5}
 #' \item{TGF-beta signature based on Fessler GEO \href{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE79461}{GSE79461}}
-#' \item{EMT (epithelial-mesenchymal transition)from MutSigDB Hallmark (v6.2)}
+#' \item{EMT (epithelial-mesenchymal transition)from MutSigDB Hallmark (v7.0)}
 #' }
 #' @references Croft D, Mundo AF, Haw R, Milacic M, Weiser J, Wu G, et al. The Reactome pathway knowledgebase. Nucl Acids Res. 2014;42:D472-7.
 #' @references Fessler E, Drost J, Hooff SR van, Linnekamp JF, Wang X, Jansen M, et al. TGF-beta signaling directs serrated adenomas to the mesenchymal colorectal cancer subtype. EMBO Molecular Medicine. 2016;8:745-60.

diff --git a/R/subVolcano.R b/R/subVolcano.R
@@ -20,7 +20,7 @@
 subVolcano <- function(deg, geneID = "rownames",
                        lfc = log2(2), padj = .05, ave=0,
                        topN = 10, cexText=1,
-                       classCol = getOption("subClassCol"),...) {
+                       classCol = getOption("subClassCol"), ...) {
 
     ddd <- list(...)
     if (!is.null(ddd$main)) main <- ddd$main else main=""
@@ -48,14 +48,14 @@ subVolcano <- function(deg, geneID = "rownames",
 
         if (length(x) < 3e3) {
             graphics::plot(x, y, xlim=xRange, col="gray", cex=.5,
-                           main=main, xlab=xlab, ylab=ylab)
+                           main=main, xlab=xlab, ylab=ylab, ...)
         } else {
             if (!packageExists("KernSmooth")) {
                 graphics::plot(x, y, xlim=xRange, col="gray", cex=.5,
-                               main=main, xlab=xlab, ylab=ylab)
+                               main=main, xlab=xlab, ylab=ylab, ...)
             } else {
                 graphics::smoothScatter(x, y, xlim=xRange, nrpoints=0,
-                                        main=main, xlab=xlab, ylab=ylab)
+                                        main=main, xlab=xlab, ylab=ylab, ...)
             }
         }
         graphics::abline(h=0, lty=1)
@@ -65,7 +65,7 @@ subVolcano <- function(deg, geneID = "rownames",
         graphics::abline(v=c(-lfc,lfc), h=hline, lty=2)
         # add features crossing threshollds
         ff <- which(abs(x) > lfc & deg$adj.P.Val < padj)
-        if (length(ff) > 1) {
+        if (length(ff) >= 1) {
             cc <- ifelse(x > 0, clCol[1], clCol[2])
             graphics::points(x[ff], y[ff], col=cc[ff], cex=.5)
 
@@ -85,7 +85,7 @@ subVolcano <- function(deg, geneID = "rownames",
 
     mtextFun <- function(...) graphics::mtext(...,side=3, cex=.67, line=0)
 
-    if(class(deg) == "list") {
+    if(class(deg)[1] == "list") {
         K <- length(deg)
         snk <- lapply(seq_len(K), function(k) {
             plotVolc(deg[[k]],

diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/data/anno.orgHs.rda b/data/anno.orgHs.rda
diff --git a/data/crcTCGAsubset.rda b/data/crcTCGAsubset.rda
diff --git a/data/geneSets.CMS.rda b/data/geneSets.CMS.rda
diff --git a/data/geneSets.CRC.rda b/data/geneSets.CRC.rda
diff --git a/data/geneSets.reactome.rda b/data/geneSets.reactome.rda
diff --git a/data/templates.CMS.rda b/data/templates.CMS.rda
diff --git a/data/templates.CRIS.rda b/data/templates.CRIS.rda
diff --git a/data/templates.MSI.rda b/data/templates.MSI.rda
diff --git a/inst/doc/CMScaller.R b/inst/doc/CMScaller.R
@@ -1,33 +1,33 @@
-## ----prepareSession, include=FALSE---------------------------------------
+## ----prepareSession, include=FALSE--------------------------------------------
 library(Biobase)
 library(BiocStyle)
 knitr::opts_chunk$set(fig.width=6, fig.height=3, 
         dev.args=list(pointsize=8), dpi=150,
         collapse=TRUE, message=TRUE, echo=TRUE, warnings=FALSE)
 options(scipen=-1, digits=2)
 
-## ---- eval=FALSE---------------------------------------------------------
+## ---- eval=FALSE--------------------------------------------------------------
 #  # dependencies: run if not already installed
 #  source("https://bioconductor.org/biocLite.R")
 #  biocLite(c("Biobase", "limma"))
 #  # proper repository to be fixed for publication
-#  install.packages("pathToPackageFile/CMScaller_0.99.0.tar.gz", repos=NULL)
+#  install.packages("pathToPackageFile/CMScaller_0.99.2.tar.gz", repos=NULL)
 
-## ----quickStart, fig.cap="CMScaller graphic output. **Left** heatmap shows relative expression for template genes. Samples (columns) are ordered according to class predictions and confidence. The height of the white bars below gives the unadjusted prediction *p*-values. Genes (rows) are ordered according to class template. Heatmap color saturation indicates magnitude while red and blue indicate genes up and down relative to the sample mean. **Right** heatmap shows results for Camera gene set analysis. Heatmap color saturation indicates statistical significance and red and blue indicates direction of change."----
+## ----quickStart, fig.cap="CMScaller graphic output. Left heatmap shows relative expression for template genes. Samples (columns) are ordered according to class predictions and confidence. The height of the white bars below gives the unadjusted prediction $p$-values. Genes (rows) are ordered according to class template. Heatmap color saturation indicates magnitude while red and blue indicate genes up and down relative to the sample mean. Right heatmap shows results for Camera gene set analysis. Heatmap color saturation indicates statistical significance and red and blue indicates direction of change."----
 library(Biobase) # if input is ExpressionSet
 library(CMScaller)
 # get RNA-seq counts from TCGA example data
 counts <- exprs(crcTCGAsubset)
-head(counts[,1:3])
+head(counts[,1:2])
 # prediction and gene set analysis
 par(mfrow=c(1,2))
-res <- CMScaller(emat=counts, RNAseq=TRUE, FDR=0.1)
-cam <- CMSgsa(emat=crcTCGAsubset, class=res$prediction,RNAseq=TRUE)
+res <- CMScaller(emat=counts, RNAseq=TRUE, FDR=0.05)
+cam <- CMSgsa(emat=counts, class=res$prediction,RNAseq=TRUE)
 # comparison with true class
 table(pred=res$prediction, true=crcTCGAsubset$CMS)
 head(res, n=3)
 
-## ----makeTemplates, fig.keep="last", fig.height=4------------------------
+## ----makeTemplates, fig.keep="last", fig.height=4-----------------------------
 emat <- crcTCGAsubset
 cms <- emat$CMS.Syn
 train <- sample(seq_along(cms), size=length(cms)/(2))
@@ -37,32 +37,39 @@ templates$symbol <- fromTo(templates$probe)
 tail(templates,n=3)
 
 ## ----visGSA, message=TRUE, fig.cap="Gene Set Analysis (GSA) shows that CMS are biologically distinct.", fig.width=3----
-# increase left margins to accomodate gene set names
+# increase left margins to accommodate gene set names
 par.old <- par()
 par(mfrow=c(1,1), mar=par.old$mar+c(0,4,0,0))
-subCamera(emat, cms, geneList=geneSets.CRC, doVoom=TRUE)
+subCamera(counts, cms, geneList=geneSets.CRC, doVoom=TRUE)
 # restore margins
 par(mar=par.old$mar)
 
-## ----input---------------------------------------------------------------
+## ----visPCA, message=TRUE, fig.cap="Principal component analysis (PCA) and CMS. First two principal components seperates CMS (left) with CMS4 characgterized by high levels of THBS4 and low levels of CLCA1 (right)."----
+# increase left margins to accommodate gene set names
+par(mfrow=c(1,2))
+p <- subPCA(emat = crcTCGAsubset, class = crcTCGAsubset$CMS.Syn, 
+            normMethod = "quantile", pch=16, frame=FALSE)
+plotPC(p, n=6, entrez=TRUE)
+
+## ----input--------------------------------------------------------------------
 # loads included emat, scales and centers
 emat <- crcTCGAsubset
 emat_sc <- ematAdjust(emat, normMethod="quantile")
-head(emat_sc[,1:3])
+head(emat_sc[,1:2])
 
-## ------------------------------------------------------------------------
+## -----------------------------------------------------------------------------
 # test set prediction
 res <- ntp(emat_sc[,-train], templates, nPerm=1000)
 res <- subSetNA(res, pValue=.1)
 table(pred=res$prediction, true=cms[-train])
 head(res)
 
-## ----principleDistance, fig.width=3--------------------------------------
+## ----principleDistance, fig.width=3-------------------------------------------
 # random centered/scaled expression matrix and templates
 set.seed(42)
 N <- 5000;P <- 5000;K <- 4;nPerm <- 1000;n <- 1
 X <- matrix(rnorm(P*N, mean=0, sd=1), ncol=N)
-Y <- matrix(rbinom(P*K, size = 1, prob=.01), ncol=K)
+Y <- matrix(rbinom(P*K, size=1, prob=.01), ncol=K)
 # sample-template correlations (implemented in corCosine)
 cos.sim <- crossprod(X,Y) / outer(
                 sqrt(apply(X, 2, crossprod)), 
@@ -72,24 +79,25 @@ simToDist <- function(cos.sim) sqrt(1/2 * (1-cos.sim))
 cos.dist <- simToDist(cos.sim)
 hist(cos.dist, xlab="cosine correlation distance")
 
-## ----principlePermutations-----------------------------------------------
+## ----principlePermutations----------------------------------------------------
 # estimate prediction confidence
 pred.class <- apply(cos.dist, 1, which.min)
 pred.dists <- apply(cos.dist, 1, min)
 null.dist <- replicate(nPerm, min(simToDist(corCosine(sample(X[,n]), Y))))
 p <- rank(c(pred.dists[n], null.dist))[1]/(length(null.dist))
 
-## ----pUniform, eval=FALSE------------------------------------------------
-#  # rearrange matrix and templates for ntp input
-#  rownames(X) <- make.names(seq_len(P))
-#  templates <- lapply(seq_len(K), function(k) rownames(X)[Y[,k]==1])
-#  names(templates) <- paste("k", seq_len(K))
-#  templates <- ntpMakeTemplates(templates, resDEG = FALSE)
-#  # takes a couple of minutes: 5000 samples and 1000 permutations
-#  res <- ntp(X, templates, nCores=4L, nPerm=nPerm, doPlot=TRUE)
-#  # expect uniform distribution
-#  hist(res$p.value)
+## ----pUniform, fig.cap="NTP results for random data. Left heatmap shows expression for template genes for random data with rows and columns sorted according to class. Right histogram shows the expected uniform $p$-value distribution for the random data."----
+# rearrange matrix and templates for ntp input
+rownames(X) <- make.names(seq_len(P))
+templates <- lapply(seq_len(K), function(k) rownames(X)[Y[,k]==1])
+names(templates) <- paste("k", seq_len(K))
+templates <- ntpMakeTemplates(templates, resDEG=FALSE)
+# permutations set to 100 to reduce processing for vignette
+par(mfrow=c(1,2))
+res <- ntp(X, templates, nCores=1L, nPerm=100, doPlot=TRUE)
+# expect uniform distribution
+hist(res$p.value, main ="", xlab="prediction p-values")
 
-## ----endSession, echo=FALSE, as.is=TRUE----------------------------------
-sessionInfo()
+## ----endSession, echo=FALSE, results='asis'-----------------------------------
+toLatex(sessionInfo())