From dd64735cef260f576461c491b7376dd25a1710dd Mon Sep 17 00:00:00 2001 From: MeenaChoi Date: Mon, 2 Jul 2018 12:40:11 -0400 Subject: [PATCH] add classification --- DESCRIPTION | 4 +- R/DIAUmpiretoMSstatsFormat.R | 17 +- R/MaxQtoMSstatsFormat.R | 11 + R/OpenMStoMSstatsFormat.R | 14 +- R/OpenSWATHtoMSstatsFormat.R | 14 +- R/PDtoMSstatsFormat.R | 47 +- R/SampleSizeCalculationClassification.R | 888 ++++++++------- R/SpectronauttoMSstatsFormat.R | 14 +- inst/NEWS | 7 + man/designSampleSizeClassification.Rd | 48 +- man/designSampleSizeClassificationPlots.Rd | 28 +- vignettes/MSstats.Rmd | 1148 ++++++++++---------- 12 files changed, 1200 insertions(+), 1040 deletions(-) mode change 100755 => 100644 R/SampleSizeCalculationClassification.R mode change 100755 => 100644 vignettes/MSstats.Rmd diff --git a/DESCRIPTION b/DESCRIPTION index 8c6c98a5..343fecd1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: MSstats Title: Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments -Version: 3.13.3 -Date: 2018-06-01 +Version: 3.13.4 +Date: 2018-07-02 Description: A set of tools for statistical relative protein significance analysis in DDA, SRM and DIA experiments. Authors@R: c( person("Meena","Choi", , "mnchoi67@gmail.com", c("aut", "cre")), diff --git a/R/DIAUmpiretoMSstatsFormat.R b/R/DIAUmpiretoMSstatsFormat.R index 293959fc..878f7b11 100644 --- a/R/DIAUmpiretoMSstatsFormat.R +++ b/R/DIAUmpiretoMSstatsFormat.R @@ -29,11 +29,22 @@ DIAUmpiretoMSstatsFormat <- function(raw.frag, raw.pep, raw.pro, if (is.null(annotation)) { stop('** Please prepare \'annotation\' as one of input.') + } else { + ## check annotation + required.annotation <- c('Condition', 'BioReplicate', 'Run') + + if (!all(required.annotation %in% colnames(annotation))) { + + missedAnnotation <- which(!(required.annotation %in% colnames(annotation))) + + stop(paste("**", toString(required.annotation[missedAnnotation]), + "is not provided in Annotation. Please check the annotation file.")) + } else { + ### annotation.txt : Condition, BioReplicate, Run, + annot <- annotation + } } - ### annotation.txt : Raw.file, Condition, BioReplicate, Run, (IsotopeLabelType) - annot <- annotation - ######################## ## 1. get selected frag from DIA-Umpire ######################## diff --git a/R/MaxQtoMSstatsFormat.R b/R/MaxQtoMSstatsFormat.R index e2f2ecae..a9063289 100644 --- a/R/MaxQtoMSstatsFormat.R +++ b/R/MaxQtoMSstatsFormat.R @@ -38,6 +38,17 @@ MaxQtoMSstatsFormat <- function(evidence, ## annotation.txt : Raw.file, Condition, BioReplicate, Run, (IsotopeLabelType) annot <- annotation + ## check annotation + required.annotation <- c('Raw.file', 'Condition', 'BioReplicate', 'Run', 'IsotopeLabelType') + + if (!all(required.annotation %in% colnames(annot))) { + + missedAnnotation <- which(!(required.annotation %in% colnames(annot))) + + stop(paste("**", toString(required.annotation[missedAnnotation]), + "is not provided in Annotation. Please check the annotation file.")) + } + ################################################ ## 1.1 remove contaminant, reverse proteinID ## Contaminant, Reverse column in evidence diff --git a/R/OpenMStoMSstatsFormat.R b/R/OpenMStoMSstatsFormat.R index f1642d42..6c61a6c5 100644 --- a/R/OpenMStoMSstatsFormat.R +++ b/R/OpenMStoMSstatsFormat.R @@ -50,7 +50,19 @@ OpenMStoMSstatsFormat <- function(input, if (is.null(annotation)) { annotinfo <- unique(input[, c("Run", "Condition", 'BioReplicate')]) } else { - annotinfo <- annotation + + ## check annotation + required.annotation <- c('Condition', 'BioReplicate', 'Run') + + if (!all(required.annotation %in% colnames(annotation))) { + + missedAnnotation <- which(!(required.annotation %in% colnames(annotation))) + + stop(paste("**", toString(required.annotation[missedAnnotation]), + "is not provided in Annotation. Please check the annotation file.")) + } else { + annotinfo <- annotation + } } ############################## diff --git a/R/OpenSWATHtoMSstatsFormat.R b/R/OpenSWATHtoMSstatsFormat.R index 09db6aab..2d53eefe 100644 --- a/R/OpenSWATHtoMSstatsFormat.R +++ b/R/OpenSWATHtoMSstatsFormat.R @@ -23,7 +23,19 @@ OpenSWATHtoMSstatsFormat <- function(input, if (is.null(annotation)) { stop('** Please prepare \'annotation\' as one of input.') } else { - annotinfo <- annotation + + ## check annotation + required.annotation <- c('Condition', 'BioReplicate', 'Run') + + if (!all(required.annotation %in% colnames(annotation))) { + + missedAnnotation <- which(!(required.annotation %in% colnames(annotation))) + + stop(paste("**", toString(required.annotation[missedAnnotation]), + "is not provided in Annotation. Please check the annotation file.")) + } else { + annotinfo <- annotation + } } ## Check correct option or input diff --git a/R/PDtoMSstatsFormat.R b/R/PDtoMSstatsFormat.R index e8572f46..6ecb7a6f 100644 --- a/R/PDtoMSstatsFormat.R +++ b/R/PDtoMSstatsFormat.R @@ -16,6 +16,18 @@ PDtoMSstatsFormat <- function(input, which.proteinid = 'Protein.Group.Accessions', which.sequence = 'Sequence'){ + ## check annotation + required.annotation <- c('Condition', 'BioReplicate', 'Run') + + if (!all(required.annotation %in% colnames(annotation))) { + + missedAnnotation <- which(!(required.annotation %in% colnames(annotation))) + + stop(paste("**", toString(required.annotation[missedAnnotation]), + "is not provided in Annotation. Please check the annotation file.", + "'Run' will be matched with 'Spectrum.File' ")) + } + ################################################ ## 0.1. which intensity : Precursor.Area vs. Intensity vs Area ################################################ @@ -244,7 +256,38 @@ PDtoMSstatsFormat <- function(input, rm(input.final) ############################## - ## 6. remove features which has 1 or 2 measurements across runs + ## 6. remove featuares with all na or zero + ## some rows have all zero values across all MS runs. They should be removed. + ############################## + + input$fea <- paste(input$PeptideModifiedSequence, + input$PrecursorCharge, + input$FragmentIon, + input$ProductCharge, + sep="_") + + inputtmp <- input[!is.na(input$Intensity) & input$Intensity > 1, ] + + count <- inputtmp %>% group_by(fea) %>% summarise(length=length(Intensity)) + + ## get feature with all NA or zeros + getfea <- count[count$length > 0, 'fea'] + + if (nrow(getfea) > 0) { + + nfea.remove <- length(unique(input$fea))-nrow(getfea) + input <- input[which(input$fea %in% getfea$fea), ] + + message(paste0('** ', nfea.remove, ' features have all NAs or zero intensity values and are removed.')) + + } + + rm(inputtmp) + input <- input[, -which(colnames(input) %in% c('fea'))] + + + ############################## + ## 7. remove features which has 1 or 2 measurements across runs ############################## if (fewMeasurements=="remove") { @@ -265,7 +308,7 @@ PDtoMSstatsFormat <- function(input, } ############################## - ## 7. remove proteins with only one peptide and charge per protein + ## 8. remove proteins with only one peptide and charge per protein ############################## if (removeProtein_with1Peptide) { diff --git a/R/SampleSizeCalculationClassification.R b/R/SampleSizeCalculationClassification.R old mode 100755 new mode 100644 index 887b3257..323a7a1f --- a/R/SampleSizeCalculationClassification.R +++ b/R/SampleSizeCalculationClassification.R @@ -1,410 +1,478 @@ -#' Calculate the optimal size of training data for classification problem -#' -#' @param data output from function \code{\link{dataProcess}} -#' @param n number of steps, which is the number of different sample sizes to simulate. Default is 5 -#' @param step number of samples per condition to increase at each step. Default is 20 -#' @param noise the fraction of protein variance to change at each step. Positive value indicates increasing noise in the data while negative value means decreasing noise. Default is 0.0 -#' @param iter number of times to repeat simulation experiments. Default is 5 -#' @description For classification problem (such as disgnosys of disease), calculate sample size of training data under different size of validation data for future experiments of a Selected Reaction Monitoring (SRM), Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) experiment based on simulation. -#' @details The function fits intensity-based linear model and uses variance components and mean abundance to simulate new data with different sample size. Training data and validation data are simulated independently. Random forest model is fitted on train data and used to predict the validation data. The above procedure is repeated \emph{iter} times. Mean predictive accuracy and variance under different size of training data and validation data are reported. For each size of validation data, the number of training samples with best predictive accuracy is also returned. -#' @return \emph{optTrainSize} is the number of training samples with best predictive accuracy under each size of validation data. -#' @return \emph{meanPA} is the mean predictive accuracy matrix under different size of training data and validation data. -#' @return \emph{varPA} is variance of predictive accuracy under different size of training data and validation data. -#' @author Ting Huang, Meena Choi, Olga Vitek. -#' -#' Maintainer: Meena Choi (\email{mnchoi67@gmail.com}) -#' @references T. Huang et al. TBD 2018 -#' @examples # Consider quantitative data (i.e. QuantData) from yeast study. -#' # A time course study with ten time points of interests and three biological replicates. -#' set.seed(1234) -#' QuantSRM <- dataProcess(SRMRawData) -#' # estimate the mean predictive accuray under different sizes of training data and validation data -#' # n is the number of biological replicates per condition -#' result.srm <- designSampleSizeClassification(data=QuantSRM, n=5, step=10) -#' result.srm$meanPA # mean predictive accuracy -#' -#' # Consider another training set from a colorectal cancer study -#' # Subjects are from control group or colorectal cancer group -#' # 72 proteins were targeted with SRM -#' require(MSstatsBioData) -#' set.seed(1235) -#' data(SRM_crc_training) -#' QuantCRCSRM <- dataProcess(SRM_crc_training) -#' crc.srm <- designSampleSizeClassification(data=QuantCRCSRM, n=10, step=10) -#' crc.srm$meanPA # mean predictive accuracy -#' @importFrom randomForest randomForest combine -#' -#' @export -designSampleSizeClassification <- function(data, - n = 5, step = 20, noise = 0.0, iter = 5) { - - ## save process output in each step - allfiles <- list.files() - filenaming <- "msstats" - - if (length(grep(filenaming, allfiles)) == 0) { - - finalfile <- "msstats.log" - processout <- NULL - - } else { - - num <- 0 - finalfile <- "msstats.log" - - while (is.element(finalfile, allfiles)) { - num <- num + 1 - lastfilename <- finalfile ## in order to rea - finalfile <- paste0(paste(filenaming, num, sep="-"), ".log") - } - finalfile <- lastfilename - processout <- as.matrix(read.table(finalfile, header=TRUE, sep="\t")) - } - processout <- rbind(processout, - as.matrix(c(" ", " ", "MSstats - designSampleSizeClassification function", " "), ncol=1)) - - ## check input is correct - ## data format - rawinput <- c("ProteinName", "PeptideSequence", "PrecursorCharge", - "FragmentIon", "ProductCharge", "IsotopeLabelType", - "Condition", "BioReplicate", "Run", "Intensity") - ## check data - if (length(setdiff(toupper(rawinput),toupper(colnames(data$ProcessedData)))) == 0) { - processout <- rbind(processout, - "The required input - data : did not process from dataProcess function. - stop") - write.table(processout, file=finalfile, row.names=FALSE) - - stop("Please use 'dataProcess' first. Then use output of dataProcess function as input in groupComparison.") - } - - ## check n - if (!(n>0) & (is.integer(n))) { - processout <- rbind(processout, - "The required input - n : was not a positive integer. - stop") - write.table(processout, file=finalfile, row.names=FALSE) - - stop("Please set n as a positive integer.") - } - - ## check step - if (!(step>0) & (is.integer(step))) { - processout <- rbind(processout, - "The required input - step : was not a positive integer. - stop") - write.table(processout, file=finalfile, row.names=FALSE) - - stop("Please set step as a positive integer.") - } - - ## check noise - if (!is.numeric(noise)) { - processout <- rbind(processout, - "The required input - noise : was not a numeric value - stop") - write.table(processout, file=finalfile, row.names=FALSE) - - stop("Please set noise as a numeric value.") - } - - ## check iteration - if (!(iter>0) & (is.integer(iter))) { - processout <- rbind(processout, - "The required input - iter : was not a positive integer. - stop") - write.table(processout, file=finalfile, row.names=FALSE) - - stop("Please set iter as a positive integer.") - } - - processout <- rbind(processout, paste0("n = ", n)) - processout <- rbind(processout, paste0("step = ", step)) - processout <- rbind(processout, paste0("noise = ", noise)) - processout <- rbind(processout, paste0("iter = ", iter)) - - ## Estimate the mean abundance and variance of each protein in each phenotype group - parameters <- .estimateVar(data) - - ## - processout <- rbind(processout, - "Calculated variance component. - okay") - write.table(processout, file=finalfile, row.names=FALSE) - - mu <- parameters$mu - sigma <- parameters$sigma - - ngroup <- length(unique(data$ProcessedData$GROUP_ORIGINAL)) ## Number of phenotype groups - ## Generate the vector of training sample size - train_size <- seq.int(from = step, to = step * n, length.out = n) - train_size <- train_size*ngroup - - ## Generate the vector of validation sample size - validation_size <- seq.int(from = step, to = step*n, length.out = n) - validation_size <- validation_size * ngroup - - message(" Start to run the simulation...") - PA <- list() - for (i in 1:iter) { ## Number of iterations - message(" Iteration: ", i) - rep <- list() - new_sigma_1 <- sigma - accur <- matrix(rep(0, times=length(train_size) * length(validation_size)), nrow=length(train_size)) - - for (m1 in seq_along(train_size)) { - message(" Sample size of training data: ", train_size[m1]) - train <- .sampleSimulation(train_size[m1], mu, new_sigma_1) #Simulate training data - x <- as.data.frame(train$X) - y <- as.factor(train$Y) - #Train random forest on training data - #rf <- randomForest::randomForest(x=x, y=y, ntree = 100) - # parallel computing for random forest - registerDoSNOW() - - mcoptions <- list(set.seed=FALSE) - rf <- foreach(ntree=rep(10, 10), .combine=randomForest::combine, .multicombine=TRUE, - .packages='randomForest', .options.multicore=mcoptions) %dopar% { - randomForest(x=x, - y=y, - ntree=ntree)} - new_sigma_1 <- new_sigma_1 * (noise + 1.0) - new_sigma_2 <- sigma - - for (m2 in seq_along(validation_size)) { ## Different validation sample sizes - message(" Sample size of validation data: ", validation_size[m2]) - valid <- .sampleSimulation(validation_size[m2], mu, new_sigma_2) #Simulate validation data - ## Calculate predictive accuracy - valid_x <- as.data.frame(valid$X) - valid_y <- as.factor(valid$Y) - rf.pred <- predict(rf, valid_x) #Predict validation data - accuracy <- sum(diag(table(rf.pred,valid_y))) / validation_size[m2] - accur[m1,m2] <- accuracy - new_sigma_2 <- new_sigma_2 * (noise + 1.0) - } - } - PA[[i]] <- accur - } - - ## Calculate the mean accuracy and variance - meanPA <- matrix(rep(0, times=length(train_size) * length(validation_size)), nrow=length(train_size)) - varPA <- matrix(rep(0, times=length(train_size) * length(validation_size)), nrow=length(train_size)) - - for (m1 in seq_along(train_size)) { - for (m2 in seq_along(validation_size)) { - temp <- NULL - for (i in 1:iter) { - temp <- c(temp, PA[[i]][m1, m2]) - } - meanPA[m1, m2] <- mean(temp) - varPA[m1, m2] <- var(temp) - } - } - rownames(meanPA) <- paste0("tra", train_size) - colnames(meanPA) <- paste0("val", validation_size) - rownames(varPA) <- paste0("tra", train_size) - colnames(varPA) <- paste0("val", validation_size) - opt <- train_size[max.col(t(meanPA))] - names(opt) <- paste("val", validation_size, sep="") - ## - processout <- rbind(processout, - "The number of sample is calculated. - okay") - write.table(processout, file=finalfile, row.names=FALSE) - - return(list(optTrainSize = opt, - meanPA = meanPA, - varPA = varPA)) -} - - - -#' Estimate the mean abundance and variance of each protein in each phenotype group -#' -#' @param data The run level data reported by dataProcess function. -#' @return mu is the mean abundance matrix of each protein in each phenotype group; sigma is the sd matrix of each protein in each phenotype group. -#' @keywords internal -.estimateVar <- function(data) { - - ## generate a fake contrast matrix - groups <- unique(data$ProcessedData$GROUP_ORIGINAL) - comparison <- matrix(rep(0, length(groups)), nrow=1) - comparison[1, 1] <- 1 - comparison[1, 2] <- -1 - row.names(comparison) <- paste0(groups[1], groups[2]) - - ## Estimate variance components and mean abundances in each group - ResultOneComparison <- groupComparison(contrast.matrix=comparison, data=data) - res <- ResultOneComparison$fittedmodel - - ## Store the results - VarComponent <- data.frame(Error=NA, Subject=NA, GroupBySubject=NA) - Var <- matrix(rep(0.0, length(res) * length(groups)), ncol = length(groups)) - MeanAbun <- matrix(rep(0.0, length(res) * length(groups)), ncol = length(groups)) - count <- 0 - - for (i in 1:length(res)) { - - # note: when run is fixed, we can obtain the same variance of error for both case-control and time course studies. - - fit.full <- res[[i]] - - ## if fit.full==NA (class(fit.full)=="try-error) - if (is.null(fit.full)) { - ## !!!!! but if we have NULL for last protein? - next - - } else { - - ## get variance component - - if (class(fit.full) != "lmerMod") { - VarComponent[i, "Error"] <- summary(fit.full)$sigma^2 - } else { - stddev <- c(sapply(VarCorr(fit.full), function(el) attr(el, "stddev")),attr(VarCorr(fit.full), "sc")) - VarComponent[i, "Error"] <- stddev[names(stddev) == ""]^2 - - if (sum(names(stddev) %in% "SUBJECT_NESTED.(Intercept)") > 0) { - VarComponent[i, "Subject"] <- stddev[names(stddev) == "SUBJECT_NESTED.(Intercept)"]^2 - } - if (sum(names(stddev) %in% "SUBJECT.(Intercept)") > 0) { - VarComponent[i, "Subject"] <- stddev[names(stddev) == "SUBJECT.(Intercept)"]^2 - } - if (sum(names(stddev) %in% "SUBJECT:GROUP.(Intercept)") > 0) { - VarComponent[i, "GroupBySubject"] <- stddev[names(stddev) == "SUBJECT:GROUP.(Intercept)"]^2 - } - } - ## get mean abundance in each group - count <- count + 1 - abun <- summary(fit.full)$coefficients[, 1] - abun[-1] <- abun[1] + abun[-1] - MeanAbun[count, ] <- abun - Var[count, ] <- rep(sqrt(sum(VarComponent[i, ], na.rm = TRUE)), times=length(abun)) - } - } ## end-loop - - return(list(mu=MeanAbun[1:count, ], - sigma=Var[1:count, ])) -} - - - -#' Simulate extended datasets for sample size estimation -#' -#' @param m number of samples to simulate -#' @param mu a matrix of mean abundance in each phenotype group and each protein -#' @param sigma a matrix of variance in each phenotype group and each protein -#' @return A simulated matrix with required sample size -#' @keywords internal -.sampleSimulation <- function(m, mu, sigma) { - - nproteins <- nrow(mu) - ngroup <- ncol(mu) - ## Determine the size of each phenotype group - samplesize <- .determineSampleSizeinEachGroup(m, ngroup) - - ## Simulate the data matrix - sim_matrix <- matrix(rep(0, nproteins * m), ncol=m) - for (i in 1:nproteins) { - abun <- NULL - for (j in 1:ngroup) { - abun <- c(abun, rnorm(samplesize[j], mu[i, j], sigma[i, j])) - } - sim_matrix[i, ] <- abun - } - sim_matrix <- t(sim_matrix) - colnames(sim_matrix) <- rownames(mu) - #Simulate the phenotype information - group <- rep(c(1:ngroup), times=samplesize) - - index <- sample(length(group), length(group)) - sim_matrix <- sim_matrix[index, ] - group <- group[index] - - return(list(X=sim_matrix, - Y=as.factor(group))) -} - -#' Determine the size of each phenotype group -#' -#' @param m sample size -#' @param ngroup number of phenotype groups -#' @return vector of sample size in each group -#' @keywords internal -.determineSampleSizeinEachGroup <- function(m, ngroup) { - samplesize <- vector(mode="numeric", length=ngroup) - counter <- 0 - - while (counter < m) { - for (i in 1:ngroup) { - if (counter < m) { - counter <- counter + 1 - samplesize[i] <- samplesize[i] + 1 - } - } - } - return(samplesize) -} - - -############################################# -## designSampleSizeClassificationPlots -############################################# -#' Visualization for sample size calculation in classification problem -#' @param data output from function \code{\link{designSampleSizeClassification}} -#' @description To illustrate the mean classification accuracy under different size of training data and validation data. The input is the result from function \code{\link{designSampleSizeClassification}}. -#' @return Plot for sample size estimation. x-axis : size of training set, y-axis: mean predictive accuracy on validation set. Color: size of validation set. -#' @details Data in the example is based on the results of sample size calculation in classification problem from function \code{\link{designSampleSizeClassification}} -#' @author Ting Huang, Meena Choi, Olga Vitek. -#' -#' Maintainer: Meena Choi (\email{mnchoi67@gmail.com}) -#' @references T. Huang et al. TBD 2018 -#' @examples # Consider quantitative data (i.e. QuantData) from yeast study. -#' # A time course study with ten time points of interests and three biological replicates. -#' set.seed(1234) -#' QuantSRM <- dataProcess(SRMRawData) -#' # estimate the mean predictive accuray under different sizes of training data and validation data -#' # n is the number of biological replicates per condition -#' result.srm <- designSampleSizeClassification(data=QuantSRM, n=5, step=10) -#' designSampleSizeClassificationPlots(data=result.srm) -#' -#' # Consider another training set from a colorectal cancer study -#' # Subjects are from control group or colorectal cancer group -#' # 72 proteins were targeted with SRM -#' require(MSstatsBioData) -#' set.seed(1235) -#' data(SRM_crc_training) -#' QuantCRCSRM <- dataProcess(SRM_crc_training) -#' crc.srm <- designSampleSizeClassification(data=QuantCRCSRM, n=10, step=10) -#' designSampleSizeClassificationPlots(data=crc.srm) -#' @importFrom reshape2 melt -#' @export -designSampleSizeClassificationPlots <- function(data) { - - ## ggplot needs a long format dataframe - ## get the mean accuracy - meandata <- as.data.frame(data$meanPA) - meandata$Training_size <- rownames(meandata) - meandata <- reshape2::melt(meandata, id.vars = "Training_size", variable.name = "Valid_size", value.name = "mean") - - ## get the variance - vardata <- as.data.frame(data$varPA) - vardata$Training_size <- rownames(vardata) - vardata <- reshape2::melt(vardata, id.vars = "Training_size", variable.name = "Valid_size", value.name = "var") - - ## perform the join - plotdata <- merge(meandata, vardata, all=TRUE) - - plotdata$Training_size <- gsub("tra", "", plotdata$Training_size) - vardata$Training_size <- as.numeric(as.character(plotdata$Training_size)) - plotdata$Valid_size <- gsub("val", "", plotdata$Valid_size) - plotdata$Valid_size <- as.character(plotdata$Valid_size) - plotdata$Valid_size <- factor(plotdata$Valid_size, levels=sort(as.numeric(unique(plotdata$Valid_size)))) - - ## make the plot - plotdata$sd <- sqrt(plotdata$var) - p <- ggplot(data = plotdata , - aes(x= as.numeric(as.character(Training_size)), y= mean, group = Valid_size, colour = Valid_size)) + - geom_point() + - geom_line() + - labs(title="Sample size estimation", x="Size of training data", y='Mean accuracy') + - guides(color=guide_legend(title="Size of validation data")) - - print(p) -} - - - +#' Estimate the optimal size of training data for classification problem +#' +#' @param data output from function \code{\link{dataProcess}} +#' @param n_sample number of different sample size to simulate. Default is 5 +#' @param sample_incr number of samples per condition to increase at each step. Default is 20 +#' @param protein_desc the fraction of proteins to reduce at each step. Proteins are ranked based on their mean abundance across all the samples. Default is 0.2. If protein_desc = 0.0, protein number will not be changed. +#' @param iter number of times to repeat simulation experiments. Default is 10 +#' @description For classification problem (such as disgnosys of disease), calculate the mean predictive accuray under different size of training data for future experiments of a Selected Reaction Monitoring (SRM), Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) experiment based on simulation. +#' @details The function fits intensity-based linear model on the input prelimiary data \emph{data} and uses variance components and mean abundance to simulate new training data with different sample size and protein number. Random forest model is fitted on simulated train data and used to predict the input preliminary data \emph{data}. The above procedure is repeated \emph{iter} times. Mean predictive accuracy and variance under different size of training data are reported. +#' @return \emph{meanPA} is the mean predictive accuracy matrix under different size of training data. +#' @return \emph{varPA} is variance of predictive accuracy under different size of training data. +#' @author Ting Huang, Meena Choi, Olga Vitek. +#' +#' Maintainer: Meena Choi (\email{mnchoi67@gmail.com}) +#' @references T. Huang et al. TBD 2018 +#' @examples # Consider the training set from a colorectal cancer study +#' # Subjects are from control group or colorectal cancer group +#' # 72 proteins were targeted with SRM +#' require(MSstatsBioData) +#' set.seed(1235) +#' data(SRM_crc_training) +#' QuantCRCSRM <- dataProcess(SRM_crc_training, normalization = FALSE) +#' # estimate the mean predictive accuray under different sizes of training data +#' # n_sample is the number of different sample size to simulate +#' # Datasets with 10 different sample size and 3 different protein numbers are simulated +#' result.crc.srm <- designSampleSizeClassification(data=QuantCRCSRM, +#' n_sample = 10, +#' sample_incr = 10, +#' protein_desc = 0.33, +#' iter = 50) +#' result.crc.srm$meanPA # mean predictive accuracy +#' @importFrom randomForest randomForest combine +#' +#' @export +designSampleSizeClassification <- function(data, + n_sample = 5, + sample_incr = 20, + protein_desc = 0.2, + iter = 10) { + + ## save process output in each step + allfiles <- list.files() + filenaming <- "msstats" + + if (length(grep(filenaming, allfiles)) == 0) { + + finalfile <- "msstats.log" + processout <- NULL + + } else { + + num <- 0 + finalfile <- "msstats.log" + + while (is.element(finalfile, allfiles)) { + num <- num + 1 + lastfilename <- finalfile ## in order to rea + finalfile <- paste0(paste(filenaming, num, sep="-"), ".log") + } + finalfile <- lastfilename + processout <- as.matrix(read.table(finalfile, header=TRUE, sep="\t")) + } + processout <- rbind(processout, + as.matrix(c(" ", " ", "MSstats - designSampleSizeClassification function", " "), ncol=1)) + + ## check input is correct + ## data format + rawinput <- c("ProteinName", "PeptideSequence", "PrecursorCharge", + "FragmentIon", "ProductCharge", "IsotopeLabelType", + "Condition", "BioReplicate", "Run", "Intensity") + ## check data + if (length(setdiff(toupper(rawinput),toupper(colnames(data$ProcessedData)))) == 0) { + processout <- rbind(processout, + "The required input - data : did not process from dataProcess function. - stop") + write.table(processout, file=finalfile, row.names=FALSE) + + stop("Please use 'dataProcess' first. Then use output of dataProcess function as input in groupComparison.") + } + + ## check n_sample + if (!((n_sample>0) & n_sample%%1==0)) { + processout <- rbind(processout, + "The required input - n_sample : was not a positive integer. - stop") + write.table(processout, file=finalfile, row.names=FALSE) + + stop("Please set n_sample as a positive integer.") + } + + ## check sample_incr + if (!((sample_incr>0) & sample_incr%%1==0)) { + processout <- rbind(processout, + "The required input - sample_incr : was not a positive integer. - stop") + write.table(processout, file=finalfile, row.names=FALSE) + + stop("Please set sample_incr as a positive integer.") + } + + ## check protein_desc + if (!(is.numeric(protein_desc) & (protein_desc >= 0) & (protein_desc < 1))) { + processout <- rbind(processout, + "The required input - protein_desc : was not a numeric value between 0 and 1 - stop") + write.table(processout, file=finalfile, row.names=FALSE) + + stop("Please set protein_desc as a numeric value.") + } + + ## check iteration + if (!((iter>0) & iter%%1==0)) { + processout <- rbind(processout, + "The required input - iter : was not a positive integer. - stop") + write.table(processout, file=finalfile, row.names=FALSE) + + stop("Please set iter as a positive integer.") + } + + processout <- rbind(processout, paste0("n_sample = ", n_sample)) + processout <- rbind(processout, paste0("sample_incr = ", sample_incr)) + processout <- rbind(processout, paste0("protein_desc = ", protein_desc)) + processout <- rbind(processout, paste0("iter = ", iter)) + + ## Estimate the mean abundance and variance of each protein in each phenotype group + parameters <- .estimateVar(data) + processout <- rbind(processout, + "Calculated variance component. - okay") + write.table(processout, file=finalfile, row.names=FALSE) + + ## Prepare the parameters for simulation experiment + mu <- parameters$mu + sigma <- parameters$sigma + promean <- parameters$promean + valid_x <- as.data.frame(parameters$X) + valid_y <- as.factor(parameters$Y) + + ## Generate the vector of training sample size to simulate + ngroup <- length(unique(data$ProcessedData$GROUP_ORIGINAL)) # Number of phenotype groups + train_size <- seq.int(from = sample_incr, to = sample_incr * n_sample, length.out = n_sample) + train_size <- train_size*ngroup + message(" Size of training data to simulate: ", paste(train_size, collapse = ', ')) + + ## Generate the vector of protein number to simulate + nproteins <- nrow(mu) + if(protein_desc == 0.0){ # no change of protein number + protein_num <- nproteins + } else{ # decrease the protein number by nproteins*protein_desc each time + m_prot <- round(nproteins*protein_desc) + protein_num <- seq.int(from = m_prot, to = nproteins, by = m_prot) + } + message(" Number of proteins to simulate: ", paste(protein_num, collapse = ', ')) + + processout <- rbind(processout, + "Prepare simulation paramters. - okay") + write.table(processout, file=finalfile, row.names=FALSE) + + message(" Start to run the simulation...") + + PA <- list() + for (i in 1:iter) { ## Number of iterations + message(" Iteration: ", i) + accur <- matrix(rep(0, times=length(train_size) * length(protein_num)), nrow=length(protein_num)) + + ## simulate train data with different size + for (n in seq_along(protein_num)) { + message(" Protein Number: ", protein_num[n]) + + ## select proteins based on their mean abundance + selectedPros<-order(promean,decreasing = TRUE)[1:protein_num[n]] + mu_2 <- mu[selectedPros,] + sigma_2 <-sigma[selectedPros,] + + ## Retired code: Simulate an independant validation data + # valid <- .sampleSimulation(valid_size, mu_2, sigma_2) + # valid_x <- as.data.frame(valid$X) + # colnames(valid_x) <- rownames(mu) + # valid_y <- as.factor(valid$Y) + + for (m in seq_along(train_size)) { ## simulate samples in the training data + ##Simulate training data + train <- .sampleSimulation(train_size[m], mu_2, sigma_2) + x <- as.data.frame(train$X) + colnames(x) <- rownames(mu_2) + y <- as.factor(train$Y) + + #Train random forest on training data + rf <- randomForest::randomForest(x=x, y=y, mtry = ) + + ## Retired code: parallel computing for random forest + #registerDoSNOW() + #mcoptions <- list(set.seed=FALSE) + #rf <- foreach(ntree=rep(10, 10), .combine=randomForest::combine, .multicombine=TRUE, + #.packages='randomForest', .options.multicore=mcoptions) %dopar% { + # randomForest(x=x, + #y=y, + #ntree=ntree)} + + ## Calculate predictive accuracy on validation data + rf.pred <- predict(rf, valid_x) #Predict validation data + accuracy <- sum(diag(table(rf.pred,valid_y))) / length(rf.pred) + accur[n,m] <- accuracy + } + } + PA[[i]] <- accur + } + + ## Calculate the mean accuracy and variance + meanPA <- matrix(rep(0, times=length(train_size) * length(protein_num)), nrow=length(protein_num)) + varPA <- matrix(rep(0, times=length(train_size) * length(protein_num)), nrow=length(protein_num)) + + for (n in seq_along(protein_num)) { + for (m in seq_along(train_size)) { + temp <- NULL + for (i in 1:iter) { + temp <- c(temp, PA[[i]][n, m]) + } + meanPA[n, m] <- mean(temp) + varPA[n, m] <- var(temp) + } + } + rownames(meanPA) <- paste0("prot", protein_num) + colnames(meanPA) <- paste0("tra", train_size) + rownames(varPA) <- paste0("prot", protein_num) + colnames(varPA) <- paste0("tra", train_size) + + ## + processout <- rbind(processout, + "The number of sample and proteins is estimated - okay") + write.table(processout, file=finalfile, row.names=FALSE) + + return(list(meanPA = meanPA, + varPA = varPA)) +} + + + +#' (1) Estimate the mean abundance and variance of each protein in each phenotype group. (2) Estimate the protein abundance of the input data +#' +#' @param data The run level data reported by dataProcess function. +#' @return \emph{mu} is the mean abundance matrix of each protein in each phenotype group; +#' @return \emph{sigma} is the sd matrix of each protein in each phenotype group; +#' @return \emph{promean} is the mean abundance of each protein across all the samples. +#' @return \emph{X} is the protein abundance matrix for \emph{data} +#' @return \emph{Y} is group information vector for \emph{data} +#' @keywords internal +.estimateVar <- function(data) { + + ## get the list of proteins in the data + rqall <- data$RunlevelData + rqall$Protein <- factor(rqall$Protein) + proteins <- levels(rqall$Protein) + + ## generate a fake contrast matrix + groups <- unique(data$ProcessedData$GROUP_ORIGINAL) + comparison <- matrix(rep(0, length(groups)), nrow=1) + comparison[1, 1] <- 1 + comparison[1, 2] <- -1 + row.names(comparison) <- paste0(groups[1], groups[2]) + + ## Estimate variance components and mean abundances in each group + ResultOneComparison <- groupComparison(contrast.matrix=comparison, data=data) + res <- ResultOneComparison$fittedmodel + + ## Store the results + VarComponent <- data.frame(Error=NA, Subject=NA, GroupBySubject=NA) + GroupVar <- matrix(rep(0.0, length(res) * length(groups)), ncol = length(groups)) + GroupMean <- matrix(rep(0.0, length(res) * length(groups)), ncol = length(groups)) + SampleMean <- NULL # mean across all the samples + count <- 0 + rnames <- NULL + for (i in 1:length(res)) { + + # note: when run is fixed, we can obtain the same variance of error for both case-control and time course studies. + fit.full <- res[[i]] + + ## if fit.full==NA (class(fit.full)=="try-error) + if (is.null(fit.full)) { + ## !!!!! but if we have NULL for last protein? + next + + } else { + + ## get variance component + if (class(fit.full) != "lmerMod") { + VarComponent[i, "Error"] <- summary(fit.full)$sigma^2 + } else { + stddev <- c(sapply(VarCorr(fit.full), function(el) attr(el, "stddev")),attr(VarCorr(fit.full), "sc")) + VarComponent[i, "Error"] <- stddev[names(stddev) == ""]^2 + + if (sum(names(stddev) %in% "SUBJECT_NESTED.(Intercept)") > 0) { + VarComponent[i, "Subject"] <- stddev[names(stddev) == "SUBJECT_NESTED.(Intercept)"]^2 + } + if (sum(names(stddev) %in% "SUBJECT.(Intercept)") > 0) { + VarComponent[i, "Subject"] <- stddev[names(stddev) == "SUBJECT.(Intercept)"]^2 + } + if (sum(names(stddev) %in% "SUBJECT:GROUP.(Intercept)") > 0) { + VarComponent[i, "GroupBySubject"] <- stddev[names(stddev) == "SUBJECT:GROUP.(Intercept)"]^2 + } + } + ## get mean abundance in each group + abun <- summary(fit.full)$coefficients[, 1] + abun[-1] <- abun[1] + abun[-1] + if(length(abun) == length(groups)){ + count <- count + 1 + rnames <- c(rnames, proteins[i]) + GroupMean[count, ] <- abun # group mean + GroupVar[count, ] <- rep(sqrt(sum(VarComponent[i, ], na.rm = TRUE)), times=length(abun)) # group variance + if (class(fit.full) == "lm") { + SampleMean <- c(SampleMean, mean(fit.full$model$ABUNDANCE, na.rm = T)) + } else{ # class(fit.full) == "lmerMod" + SampleMean <- c(SampleMean, mean(fit.full@frame$ABUNDANCE, na.rm = T)) + } + } else{ + message("Protein ", proteins[i], " is missing from at least one group and not considered in the simulation.") + } + } + } ## end-loop + + GroupMean <- GroupMean[1:count, ] # remove the rows for unqualitied proteins + GroupVar <- GroupVar[1:count, ] # remove the rows for unqualitied proteins + rownames(GroupMean) <- rnames + rownames(GroupVar) <- rnames + names(SampleMean) <- rnames + + # get the protein abundance matrix of the input preliminary data + Quant <- quantification(data) + rownames(Quant) <- Quant[,1] + Quant <- Quant[,-1] # columns are samples and rows are proteins + valid_x <- Quant # generate data matrix + out <- strsplit(as.character(colnames(valid_x)),'_') # generate group vector + valid_y <- do.call(rbind, out)[,1] # first column is group information + valid_x <-apply(valid_x,1, function(x) .random.imp(x)) # impute missing values + valid_x<- as.data.frame(valid_x) + + return(list(promean = SampleMean, # mean protein abundance across all the samples + mu=GroupMean, # mean protein abundance in each group + sigma=GroupVar, # standard deviation in each group + X = valid_x, # protein abundance matrix + Y = valid_y # vector with group information + )) +} + +#' Simulate extended datasets for sample size estimation +#' +#' @param m number of samples to simulate +#' @param mu a matrix of mean abundance in each phenotype group and each protein +#' @param sigma a matrix of variance in each phenotype group and each protein +#' @return \emph{X} A simulated matrix with required sample size +#' @return \emph{Y} Group information corresponding with \emph{X} +#' @keywords internal +.sampleSimulation <- function(m, mu, sigma) { + + nproteins <- nrow(mu) + ngroup <- ncol(mu) + ## Determine the size of each phenotype group + samplesize <- .determineSampleSizeinEachGroup(m, ngroup) + + ## Simulate the data matrix + sim_matrix <- matrix(rep(0, nproteins * m), ncol=m) + for (i in 1:nproteins) { + abun <- NULL + for (j in 1:ngroup) { + abun <- c(abun, rnorm(samplesize[j], mu[i, j], sigma[i, j])) + } + sim_matrix[i, ] <- abun + } + sim_matrix <- t(sim_matrix) + colnames(sim_matrix) <- rownames(mu) + #Simulate the phenotype information + group <- rep(c(1:ngroup), times=samplesize) + + index <- sample(length(group), length(group)) + sim_matrix <- sim_matrix[index, ] + group <- group[index] + + return(list(X=sim_matrix, + Y=as.factor(group))) +} + +#' Determine the size of each phenotype group +#' +#' @param m sample size +#' @param ngroup number of phenotype groups +#' @return vector of sample size in each group +#' @keywords internal +.determineSampleSizeinEachGroup <- function(m, ngroup) { + samplesize <- vector(mode="numeric", length=ngroup) + counter <- 0 + + while (counter < m) { + for (i in 1:ngroup) { + if (counter < m) { + counter <- counter + 1 + samplesize[i] <- samplesize[i] + 1 + } + } + } + return(samplesize) +} + +#' For each protein, impute the missing values based on the observed values +#' +#' @param data protein abundance data for one protein. +#' @return Imputed protein abundance data +#' @keywords internal +.random.imp <- function (data){ + missing <- is.na(data) # count missing values + n.missing <- sum(missing) + data.obs <- data[!missing] # keep the observed values + imputed <- data + # impute missing values by randomly selecting observed values + imputed[missing] <- sample (data.obs, n.missing, replace=TRUE) + return (imputed) +} + +############################################# +## designSampleSizeClassificationPlots +############################################# +#' Visualization for sample size calculation in classification problem +#' @param data output from function \code{\link{designSampleSizeClassification}} +#' @description To illustrate the mean classification accuracy under different protein number and sample size. The input is the result from function \code{\link{designSampleSizeClassification}}. +#' @return Plot for sample size estimation. x-axis : sample size, y-axis: mean predictive accuracy. Color: different protein number. +#' @details Data in the example is based on the results of sample size calculation in classification problem from function \code{\link{designSampleSizeClassification}} +#' @author Ting Huang, Meena Choi, Olga Vitek. +#' +#' Maintainer: Meena Choi (\email{mnchoi67@gmail.com}) +#' @references T. Huang et al. TBD 2018 +#' @examples # Consider the training set from a colorectal cancer study +#' # Subjects are from control group or colorectal cancer group +#' # 72 proteins were targeted with SRM +#' require(MSstatsBioData) +#' set.seed(1235) +#' data(SRM_crc_training) +#' QuantCRCSRM <- dataProcess(SRM_crc_training, normalization = FALSE) +#' # estimate the mean predictive accuray under different sizes of training data +#' # n_sample is the number of different sample size to simulate +#' # Datasets with 10 different sample size and 3 different protein numbers are simulated +#' result.crc.srm <- designSampleSizeClassification(data=QuantCRCSRM, +#' n_sample = 10, +#' sample_incr = 10, +#' protein_desc = 0.33, +#' iter = 50) +#' designSampleSizeClassificationPlots(data=result.crc.srm) +#' @importFrom reshape2 melt +#' @export +designSampleSizeClassificationPlots <- function(data) { + + ## ggplot needs a long format dataframe + ## get the mean accuracy + meandata <- as.data.frame(data$meanPA) + meandata$Protein_number <- rownames(meandata) + meandata <- reshape2::melt(meandata, id.vars = "Protein_number", variable.name = "Train_size", value.name = "mean") + + ## get the variance + vardata <- as.data.frame(data$varPA) + vardata$Protein_number <- rownames(vardata) + vardata <- reshape2::melt(vardata, id.vars = "Protein_number", variable.name = "Train_size", value.name = "var") + + ## perform the join + plotdata <- merge(meandata, vardata, all=TRUE) + # get standard deviation column + plotdata$sd <- sqrt(plotdata$var) + # make sure train size is numeric + plotdata$Train_size <- gsub("tra", "", plotdata$Train_size) + plotdata$Train_size <- as.numeric(as.character(plotdata$Train_size)) + # make sure Protein_number is ordered factor + plotdata$Protein_number <- gsub("prot", "", plotdata$Protein_number) + plotdata$Protein_number <- factor(plotdata$Protein_number, levels = sort(as.numeric(unique(plotdata$Protein_number)))) + + ## make the plot + p <- ggplot(data = plotdata, aes(x= Train_size, y= mean, group = Protein_number, colour = Protein_number)) + + geom_point() + + geom_line() + + labs(title="Sample size estimation", x="Sample size", y='Mean accuracy') + + guides(color=guide_legend(title="Protein Number")) + + print(p) +} + + + diff --git a/R/SpectronauttoMSstatsFormat.R b/R/SpectronauttoMSstatsFormat.R index 881c9b8e..71d1b792 100644 --- a/R/SpectronauttoMSstatsFormat.R +++ b/R/SpectronauttoMSstatsFormat.R @@ -66,7 +66,19 @@ SpectronauttoMSstatsFormat <- function(input, annotinfo <- unique(input[, c("R.FileName", "R.Condition", "R.Replicate")]) colnames(annotinfo) <- c('Run', 'Condition', 'BioReplicate') } else { - annotinfo <- annotation + ## check annotation + required.annotation <- c('Condition', 'BioReplicate', 'Run') + + if (!all(required.annotation %in% colnames(annotation))) { + + missedAnnotation <- which(!(required.annotation %in% colnames(annotation))) + + stop(paste("**", toString(required.annotation[missedAnnotation]), + "is not provided in Annotation. Please check the annotation file.", + "'Run' will be matched with 'R.FileName' ")) + } else { + annotinfo <- annotation + } } ############################## diff --git a/inst/NEWS b/inst/NEWS index 90e9377e..eb39d850 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,3 +1,10 @@ +CHANGES IN VERSION 3.13.4 [2018-07-02] +------------------------- + UPDATED FEATURES + - designSampleSizeClassification and designSampleSizeClassificationPlots + - Converter functions check the required column of annootation file. + + CHANGES IN VERSION 3.13.3 [2018-06-01] ------------------------- BUG FIXES diff --git a/man/designSampleSizeClassification.Rd b/man/designSampleSizeClassification.Rd index 2bc03b82..12ff07de 100644 --- a/man/designSampleSizeClassification.Rd +++ b/man/designSampleSizeClassification.Rd @@ -2,54 +2,50 @@ % Please edit documentation in R/SampleSizeCalculationClassification.R \name{designSampleSizeClassification} \alias{designSampleSizeClassification} -\title{Calculate the optimal size of training data for classification problem} +\title{Estimate the optimal size of training data for classification problem} \usage{ -designSampleSizeClassification(data, n = 5, step = 20, noise = 0, - iter = 5) +designSampleSizeClassification(data, n_sample = 5, sample_incr = 20, + protein_desc = 0.2, iter = 10) } \arguments{ \item{data}{output from function \code{\link{dataProcess}}} -\item{n}{number of steps, which is the number of different sample sizes to simulate. Default is 5} +\item{n_sample}{number of different sample size to simulate. Default is 5} -\item{step}{number of samples per condition to increase at each step. Default is 20} +\item{sample_incr}{number of samples per condition to increase at each step. Default is 20} -\item{noise}{the fraction of protein variance to change at each step. Positive value indicates increasing noise in the data while negative value means decreasing noise. Default is 0.0} +\item{protein_desc}{the fraction of proteins to reduce at each step. Proteins are ranked based on their mean abundance across all the samples. Default is 0.2. If protein_desc = 0.0, protein number will not be changed.} -\item{iter}{number of times to repeat simulation experiments. Default is 5} +\item{iter}{number of times to repeat simulation experiments. Default is 10} } \value{ -\emph{optTrainSize} is the number of training samples with best predictive accuracy under each size of validation data. +\emph{meanPA} is the mean predictive accuracy matrix under different size of training data. -\emph{meanPA} is the mean predictive accuracy matrix under different size of training data and validation data. - -\emph{varPA} is variance of predictive accuracy under different size of training data and validation data. +\emph{varPA} is variance of predictive accuracy under different size of training data. } \description{ -For classification problem (such as disgnosys of disease), calculate sample size of training data under different size of validation data for future experiments of a Selected Reaction Monitoring (SRM), Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) experiment based on simulation. +For classification problem (such as disgnosys of disease), calculate the mean predictive accuray under different size of training data for future experiments of a Selected Reaction Monitoring (SRM), Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) experiment based on simulation. } \details{ -The function fits intensity-based linear model and uses variance components and mean abundance to simulate new data with different sample size. Training data and validation data are simulated independently. Random forest model is fitted on train data and used to predict the validation data. The above procedure is repeated \emph{iter} times. Mean predictive accuracy and variance under different size of training data and validation data are reported. For each size of validation data, the number of training samples with best predictive accuracy is also returned. +The function fits intensity-based linear model on the input prelimiary data \emph{data} and uses variance components and mean abundance to simulate new training data with different sample size and protein number. Random forest model is fitted on simulated train data and used to predict the input preliminary data \emph{data}. The above procedure is repeated \emph{iter} times. Mean predictive accuracy and variance under different size of training data are reported. } \examples{ -# Consider quantitative data (i.e. QuantData) from yeast study. -# A time course study with ten time points of interests and three biological replicates. -set.seed(1234) -QuantSRM <- dataProcess(SRMRawData) -# estimate the mean predictive accuray under different sizes of training data and validation data -# n is the number of biological replicates per condition -result.srm <- designSampleSizeClassification(data=QuantSRM, n=5, step=10) -result.srm$meanPA # mean predictive accuracy - -# Consider another training set from a colorectal cancer study +# Consider the training set from a colorectal cancer study # Subjects are from control group or colorectal cancer group # 72 proteins were targeted with SRM require(MSstatsBioData) set.seed(1235) data(SRM_crc_training) -QuantCRCSRM <- dataProcess(SRM_crc_training) -crc.srm <- designSampleSizeClassification(data=QuantCRCSRM, n=10, step=10) -crc.srm$meanPA # mean predictive accuracy +QuantCRCSRM <- dataProcess(SRM_crc_training, normalization = FALSE) +# estimate the mean predictive accuray under different sizes of training data +# n_sample is the number of different sample size to simulate +# Datasets with 10 different sample size and 3 different protein numbers are simulated +result.crc.srm <- designSampleSizeClassification(data=QuantCRCSRM, +n_sample = 10, +sample_incr = 10, +protein_desc = 0.33, +iter = 50) +result.crc.srm$meanPA # mean predictive accuracy } \references{ T. Huang et al. TBD 2018 diff --git a/man/designSampleSizeClassificationPlots.Rd b/man/designSampleSizeClassificationPlots.Rd index 4b76741f..070ae3ff 100644 --- a/man/designSampleSizeClassificationPlots.Rd +++ b/man/designSampleSizeClassificationPlots.Rd @@ -10,33 +10,31 @@ designSampleSizeClassificationPlots(data) \item{data}{output from function \code{\link{designSampleSizeClassification}}} } \value{ -Plot for sample size estimation. x-axis : size of training set, y-axis: mean predictive accuracy on validation set. Color: size of validation set. +Plot for sample size estimation. x-axis : sample size, y-axis: mean predictive accuracy. Color: different protein number. } \description{ -To illustrate the mean classification accuracy under different size of training data and validation data. The input is the result from function \code{\link{designSampleSizeClassification}}. +To illustrate the mean classification accuracy under different protein number and sample size. The input is the result from function \code{\link{designSampleSizeClassification}}. } \details{ Data in the example is based on the results of sample size calculation in classification problem from function \code{\link{designSampleSizeClassification}} } \examples{ -# Consider quantitative data (i.e. QuantData) from yeast study. -# A time course study with ten time points of interests and three biological replicates. -set.seed(1234) -QuantSRM <- dataProcess(SRMRawData) -# estimate the mean predictive accuray under different sizes of training data and validation data -# n is the number of biological replicates per condition -result.srm <- designSampleSizeClassification(data=QuantSRM, n=5, step=10) -designSampleSizeClassificationPlots(data=result.srm) - -# Consider another training set from a colorectal cancer study +# Consider the training set from a colorectal cancer study # Subjects are from control group or colorectal cancer group # 72 proteins were targeted with SRM require(MSstatsBioData) set.seed(1235) data(SRM_crc_training) -QuantCRCSRM <- dataProcess(SRM_crc_training) -crc.srm <- designSampleSizeClassification(data=QuantCRCSRM, n=10, step=10) -designSampleSizeClassificationPlots(data=crc.srm) +QuantCRCSRM <- dataProcess(SRM_crc_training, normalization = FALSE) +# estimate the mean predictive accuray under different sizes of training data +# n_sample is the number of different sample size to simulate +# Datasets with 10 different sample size and 3 different protein numbers are simulated +result.crc.srm <- designSampleSizeClassification(data=QuantCRCSRM, +n_sample = 10, +sample_incr = 10, +protein_desc = 0.33, +iter = 50) +designSampleSizeClassificationPlots(data=result.crc.srm) } \references{ T. Huang et al. TBD 2018 diff --git a/vignettes/MSstats.Rmd b/vignettes/MSstats.Rmd old mode 100755 new mode 100644 index 07435d67..23dbcc73 --- a/vignettes/MSstats.Rmd +++ b/vignettes/MSstats.Rmd @@ -1,579 +1,569 @@ - -```{r style, echo = FALSE, results = 'asis'} -BiocStyle::markdown() -``` -```{r global_options, include=FALSE} -knitr::opts_chunk$set(fig.width=10, fig.height=7, warning=FALSE, message=FALSE) -options(width=110) -``` - - - -# MSstats: Protein/Peptide significance analysis - -Package: MSstats - -Author: Meena Choi , Tsung-Heng Tsai , Cyril Galitzine - -Date: October 2, 2017 - - -This vignette summarizes the introduction and various options of all functionalities in MSstats. More details are available in `User Manual`. - -- The types of experiment that MSstats can analyze are LC-MS, SRM, DIA(SWATH) with label-free or labeled synthetic peptides. MSstats does not support for metabolic labeling or iTRAQ experiments. - -## SkylinetoMSstatsFormat - -Preprocess MSstats input report from Skyline and convert into the required input format for MSstats. - -### Arguments - -* `input` : name of MSstats input report from Skyline, which includes feature-level data. -* `annotation` : name of 'annotation.txt' data which includes `Condition`, `BioReplicate`, `Run`. If annotation is already complete in Skyline, use `annotation=NULL` (default). It will use the annotation information from input. -* `removeiRT` : `removeiRT=TRUE`(default) will remove the proteins or peptides which are labeld 'iRT' in 'StandardType' column. `removeiRT=FALSE` will keep them. -* `removeProtein_with1Peptide` : `removeProtein_with1Peptide=TRUE` will remove the proteins which have only 1 peptide and charge. `removeProtein_with1Peptide=FALSE` is default. -* `filter_with_Qvalue` : `removeProtein_with1Peptide=TRUE`(default) will filter out the intensities that have greater than `qvalue_cutoff` in `DetectionQValue` column. Those intensities will be replaced with zero and will be considered as censored missing values for imputation purpose. -* `qvalue_cutoff` : Cutoff for `DetectionQValue`. default is 0.01. - -### Example -```{r, eval=FALSE} -# 'MSstatsInput.csv' is the MSstats report from Skyline. -input <- read.csv(file="MSstatsInput.csv") - -raw <- SkylinetoMSstatsFormat(input) -``` - -## MaxQtoMSstatsFormat - -Convert MaxQuant output into the required input format for MSstats. - -### Arguments - -* `evidence` : name of `'evidence.txt'` data, which includes feature-level data. -* `annotation` : name of `'annotation.txt'` data which includes `Raw.file`, `Condition`, `BioReplicate`, `Run`, `IsotopeLabelType` information. -* `proteinGroups` : name of `'proteinGroups.txt'` data. It needs to matching protein group ID. If `proteinGroups=NULL`, use `'Proteins'` column in `'evidence.txt'`. -* `proteinID` : `'Proteins'`(default) or `'proteinGroups'` in `'proteinGroup.txt'` for Protein ID. -* `useUniquePeptide` : `useUniquePeptide=TRUE`(default) removes peptides that are assigned for more than one proteins. We assume to use unique peptide for each protein. -* `summaryforMultipleRows` : `summaryforMultipleRows=max`(default) or sum, when there are multiple measurements for certain feature and certain run, use highest or sum of all. -* `fewMeasurements` : `fewMeasurements='remove'`(default) will remove the features that have 1 or 2 measurements across runs. -* `removeMpeptides` : `removeMpeptides=TRUE`(default) will remove the peptides including 'M' sequence. -* `removeProtein_with1Peptide` : `removeProtein_with1Peptide=TRUE` will remove the proteins which have only 1 peptide and charge. `FALSE` is default. - -### Example -```{r, eval=FALSE} -# Read in MaxQuant files -proteinGroups <- read.table("proteinGroups.txt", sep="\t", header=TRUE) - -infile <- read.table("evidence.txt", sep="\t", header=TRUE) - -# Read in annotation including condition and biological replicates per run. -# Users should make this annotation file. It is not the output from MaxQuant. -annot <- read.csv("annotation.csv", header=TRUE) - -raw <- MaxQtoMSstatsFormat(evidence=infile, - annotation=annot, - proteinGroups=proteinGroups) -``` - -## ProgenesistoMSstatsFormat - -Convert Progenesis output into the required input format for MSstats. - -### Arguments - -* `input` : name of Progenesis output, which is wide-format. `Accession`, `Sequence`, `Modification`, `Charge` and one column for each run are required. -* `annotation` : name of `'annotation.txt'` or `'annotation.csv'` data which includes `Condition`, `BioReplicate`, `Run` information. It will be matched with the column name of input for MS runs. -* `useUniquePeptide` : `useUniquePeptide=TRUE`(default) removes peptides that are assigned for more than one proteins. We assume to use unique peptide for each protein. -* `summaryforMultipleRows` : `summaryforMultipleRows=max`(default) or `sum`, when there are multiple measurements for certain feature and certain run, use highest or sum of multiple intensities. -* `fewMeasurements` : `fewMeasurements='remove'`(default) will remove the features that have 1 or 2 measurements across runs. -* `removeProtein_with1Peptide` : `removeProtein_with1Peptide=TRUE` will remove the proteins which have only 1 peptide and charge. `FALSE` is default. - -### Example -```{r, eval=FALSE} -input <- read.csv("output_progenesis.csv", stringsAsFactors=F) - -# Read in annotation including condition and biological replicates per run. -# Users should make this annotation file. It is not the output from Progenesis. -annot <- read.csv('annotation.csv') - -raw <- ProgenesistoMSstatsFormat(input, annotation=annot) -``` - -## SpectronauttoMSstatsFormat - -Convert Spectronaut output into the required input format for MSstats. - -### Arguments - -* `input`: name of Spectronaut output, which is long-format. `ProteinName`, `PeptideSequence`, `PrecursorCharge`, `FragmentIon`, `ProductCharge`, `IsotopeLabelType`, `Condition`, `BioReplicate`, `Run`, `Intensity`, `F.ExcludedFromQuantification` are required. Rows with `F.ExcludedFromQuantification=True` will be removed. -* `summaryforMultipleRows` : `summaryforMultipleRows=max`(default) or `sum`, when there are multiple measurements for certain feature and certain run, use highest or sum of multiple intensities. - -### Example -```{r, eval=FALSE} -input <- read.csv("output_spectronaut.csv", stringsAsFactors=F) - -quant <- SpectronauttoMSstatsFormat(input) -``` - - -## dataProcess - -Data pre-processing and quality control of MS runs of the original raw data into quantitative data for model fitting and group comparison. Log transformation is automatically applied and additional variables are created in columns for model fitting and group comparison process. Three options of data pre-processing and quality control of MS runs in dataProcess are - -- Transformation: logarithm transformation of intensity with base 2 or 10. -- Normalization: to remove systematic bias between MS runs. -- - -### Arguments - -* `raw` : name of the raw (input) data set. -* `logTrans` : logarithm transformation with base 2(default) or 10. If `logTrans=2`, the measurement of Variable `ABUNDANCE` is log-transformed with base 2. Same apply to `logTrans=10`. -* `normalization` : normalization to remove systematic bias between MS runs. There are three different normalizations supported. `'equalizeMedians'` (default) represents constant normalization (equalizing the medians) based on reference signals is performed. `'quantile'` represents quantile normalization based on reference signals is performed. `'globalStandards'` represents normalization with global standards proteins. `FALSE` represents no normalization is performed. -* `betweenRunInterferenceScore` : interference is detected by a between-run-interference score. `TRUE` means the scores are generated automatically and stored in a .csv file. `FALSE` (default) means no scores are generated. -* `fillIncompleteRows` : If the input dataset has incomplete rows, `TRUE` (default) adds the rows with intensity value=NA for missing peaks. `FALSE` reports error message with list of features which have incomplete rows. -* `featureSubset` : `"all"` (default) uses all features that the data set has. `"top3"` uses top 3 features which have highest average of log2(intensity) across runs. `"topN"` uses top N features which has highest average of log2(intensity) across runs. It needs the input for `n_top_feature` option. `"highQuality"` selects the most informative features which agree the pattern of the average features across the runs. -* `remove_proteins_with_interference` : `TRUE` allows the algorithm to remove the proteins if deem interfered. `FALSE` (default) does not allow to remove the proteins, in which all features are interfered. In this case, the proteins, which will completely loss all features by the algorithm, will keep the most abundant peptide -* `n_top_feature` : The number of top features for `featureSubset='topN'`. Default is `n_top_feature=3`, which means to use top 3 features.} -* `summaryMethod` : `"TMP"` (default) means Tukey's median polish, which is robust estimation method. `"linear"` uses linear mixed model. -* `equalFeatureVar` : only for `summaryMethod="linear"`. Default is `TRUE`. Logical variable for whether the model should account for heterogeneous variation among intensities from different features. Default is TRUE, which assume equal variance among intensities from features. `FALSE` means that we cannot assume equal variance among intensities from features, then we will account for heterogeneous variation from different features. -* `censoredInt` : Missing values are censored or at random. `'NA'` (default) assumes that all 'NA's in 'Intensity' column are censored. `'0'` uses zero intensities as censored intensity. In this case, NA intensities are missing at random. The output from Skyline and Progenesis should use '0'. Null assumes that all NA intensites are randomly missing. -* `cutoffCensored` : Cutoff value for censoring. Only with `censoredInt='NA'` or `'0'`. Default is `'minFeature'`, which uses minimum value for each feature. `'minFeatureNRun'` uses the smallest between minimum value of corresponding feature and minimum value of corresponding run. `'minRun'` uses minumum value for each run. -* `MBimpute` : only for `summaryMethod="TMP"` and `censoredInt='NA'` or `'0'`. `TRUE` (default) imputes 'NA' or '0' (depending on censoredInt option) by Accelated failure model. `FALSE` uses the values assigned by cutoffCensored. -* `remove50missing` : only for `summaryMethod="TMP"`. `TRUE` removes the runs which have more than 50\% missing values. `FALSE` is default. -* `address` : the name of folder that will store the results. Default folder is the current working directory. The other assigned folder has to be existed under the current working directory. An output csv file is automatically created with the default name of "BetweenRunInterferenceFile.csv". The command address can help to specify where to store the file as well as how to modify the beginning of the file name. -* `maxQuantileforCensored` : Maximum quantile for deciding censored missing values. Default is 0.999. - -### Example -```{r, eval=FALSE} -QuantData <- dataProcess(SRMRawData) -``` - - -## dataProcessPlots - -Visualization for explanatory data analysis. To illustrate the quantitative data after data-preprocessing and quality control of MS runs, `dataProcessPlots` takes the quantitative data from function `dataProcess` as input and automatically generate three types of figures in pdf files as output : - - - Profile plot : to identify the potential sources of variation for each protein. One of output from `dataProcess`, `xxx$ProcessedData`, is used for profile plots. Another output from `dataProcess`, `xxx$RunlevelData`, is used for profile plots with summarization. X-axis represents MS runs. Y-axis is for log-intensities of transitions. Reference/endogenous signals are in the left/right panel. Line colors indicate peptides and line types indicate transitions. Type of dots(filled vs empty dot) indicates censored missing values or not. In summarization plots, gray dots and lines are the same as original profile plots with `xxx$ProcessedData`. Dark dots and lines are for summarized intensities from `xxx$RunlevelData`. - - - Quality control plot : to illustrate the systematic bias between MS runs and to check normalization. After normalization, the reference signals for all proteins should be stable across MS runs. `xxx$ProcessedData` is used for plots. X-axis is for MS runs. Y-axis is for log-intensities of transition (`xxx$ProcessedData$ABUNDANCE`). Reference/endogenous signals are in the left/right panel. The pdf file contains (1) QC plot for all proteins and (2) QC plots for each protein separately. - - - Mean plot for conditions : to illustrate mean and variability of each condition per protein. Summarized intensnties from `xxx$RunlevelData` are used for plots. X-axis is for condition. Y-axis is for summarized log transformed intensities. If `scale=TRUE`, the levels of conditions is scaled according to its actual values at x-axis. Red points indicate the mean for each condition. If `interval="CI"`, error bars indicate the confidence interval with 0.95 significant level for each condition. If `interval="SD"`, error bars indicate the standard deviation for each condition. **The interval is not related with model-based analysis in `groupCompaison`**. - -### Arguments - -* `data` : name of the (output of `dataProcess` function) data set. -* `type` : choice of visualization. `"ProfilePlot"` represents profile plot of log intensities across MS runs. `"QCPlot"` represents quality control plot of log intensities across MS runs. `"ConditionPlot"` represents mean plot of log ratios (Light/Heavy) across conditions. -* `featureName` : for `"ProfilePlot"` only, `"Transition"` (default) means printing feature legend in transition-level. `"Peptide"` means printing feature legend in peptide-level. `"NA"` means no feature legend printing. -* `ylimUp` : upper limit for y-axis in the log scale. `FALSE` (Default) for Profile Plot and QC Plot use the upper limit as rounded off maximum of log2(intensities) after normalization + 3. `FALSE` (Default) for Condition Plot is maximum of log ratio + SD or CI. -* `ylimDown` : lower limit for y-axis in the log scale. `FALSE` (Default) for Profile Plot and QC Plot is 0. `FALSE`` (Default) for Condition Plot is minumum of log ratio - SD or CI. -* `scale` : for `"ConditionPlot"` only, `FALSE` (default) means each conditional level is not scaled at x-axis according to its actual value (equal space at x-axis). `TRUE` means each conditional level is scaled at x-axis according to its actual value (unequal space at x-axis). -* `interval` : for `"ConditionPlot"` only, `"CI"` (default) uses confidence interval with 0.95 significant level for the width of error bar. `"SD"` uses standard deviation for the width of error bar. -* `x.axis.size` : size of x-axis labeling for "MS Runs" in Profile Plot and QC Plot, and "Condition" in Condition Plot. Default is `10`. -* `y.axis.size` : size of y-axis labels. Default is `10`. -* `text.size` : size of labels represented each condition at the top of graph in Profile Plot and QC plot. Default is `4` -* `text.angle` : angle of labels represented each condition at the top of graph in Profile Plot and QC plot or x-axis labeling in Condition plot. Default is `0`. -* `legend.size` : size of feature legend (transition-level or peptide-level) above graph in Profile Plot. Default is `7`. -* `dot.size.profile` : size of dots in profile plot. Default is `2`. -* `dot.size.condition` : size of dots in condition plot. Default is `3`. -* `width` : width of the saved file. Default is 10. -* `height` : height of the saved file. Default is 10. -* `which.Protein` : Protein list to draw plots. List can be names of Proteins or order numbers of Proteins from `levels(xxx$ProcessedData$PROTEIN)`. Default is `"all"`, which generates all plots for each protein. -* `originalPlot` : `TRUE` (default) draws original profile plots. -* `summaryPlot` : `TRUE` (default) draws profile plots with summarization for run levels. -* `save_condition_plot_result` : `TRUE` saves the table with values using condition plots. Default is FALSE. -* `address` : the name of folder that will store the results. Default folder is the current working directory. The other assigned folder has to be existed under the current working directory. An output pdf file is automatically created with the default name of `"ProfilePlot.pdf"` or `"QCplot.pdf"` or `"ConditionPlot.pdf"` or `"ConditionPlot_value.csv"`. The command address can help to specify where to store the file as well as how to modify the beginning of the file name. If `address=FALSE`, plot will be not saved as pdf file but showed in window. - - -### Example -```{r, eval=FALSE} -QuantData <- dataProcess(SRMRawData) - -# Profile plot -dataProcessPlots(data=QuantData, type="ProfilePlot") - -# Quality control plot -dataProcessPlots(data=QuantData, type="QCPlot") - -# Quantification plot for conditions -dataProcessPlots(data=QuantData, type="ConditionPlot") -``` - - -## groupComparison -Tests for significant changes in protein abundance across conditions based on a family of linear mixed-effects models in targeted Selected Reaction Monitoring (SRM), Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) - experiment. It is applicable to multiple types of sample preparation, including label-free workflows, workflows that use stable isotope labeled reference proteins and peptides, and workflows that use fractionation. Experimental design of case-control study (patients are not repeatedly measured) or time course study (patients are repeatedly measured) is automatically determined based on proper statistical model. - -### Arguments - -* `contrast.matrix` : comparison between conditions of interests. Based on the levels of conditions, specify 1 or -1 to the conditions of interests and 0 otherwise. The levels of conditions are sorted alphabetically. Command `levels(xxx$ProcessedData$GROUP_ORIGINAL)` after using `dataProcess` function can illustrate the actual order of the levels of conditions. -* `data` : name of the (output of dataProcess function) data set. - - -### Example -```{r, eval=FALSE} -QuantData <- dataProcess(SRMRawData) - -levels(QuantData$ProcessedData$GROUP_ORIGINAL) -comparison <- matrix(c(-1,0,0,0,0,0,1,0,0,0), nrow=1) -row.names(comparison) <- "T7-T1" - -# Tests for differentially abundant proteins with models: -testResultOneComparison <- groupComparison(contrast.matrix=comparison, data=QuantData) -``` - -## groupComparisonPlots - -Visualization for model-based analysis and summarizing differentially abundant proteins. To summarize the results of log-fold changes and adjusted p-values for differentially abundant proteins, `groupComparisonPlots` takes testing results from function `groupComparison` as input and automatically generate three types of figures in pdf files as output : - -- Volcano plot : For each comparison separately. It illustrates actual log-fold changes and adjusted p-values for each comparison separately with all proteins. The x-axis is the log fold change. The base of logarithm transformation is the same as specified in "logTrans" from `dataProcess`. The y-axis is the negative log2 or log10 adjusted p-values. The horizontal dashed line represents the FDR cutoff. The points below the FDR cutoff line are non-significantly abundant proteins (colored in black). The points above the FDR cutoff line are significantly abundant proteins (colored in red/blue for up-/down-regulated). If fold change cutoff is specified (`FCcutoff = specific value`), the points above the FDR cutoff line but within the FC cutoff line are non-significantly abundant proteins (colored in black). - -- Heatmap : For multiple comparisons. It illustratea up-/down-regulated proteins for multiple comparisons with all proteins. Each column represents each comparison of interest. Each row represents each protein. Color red/blue represents proteins in that specific comparison are significantly up-regulated/down-regulated proteins with FDR cutoff and/or FC cutoff. The color scheme shows the evidences of significance. The darker color it is, the stronger evidence of significance it has. Color gold represents proteins are not significantly different in abundance. - -- Comparison plot : For multiple comparisons per protein. It illustrates log-fold change and its variation of multiple comparisons for single protein. X-axis is comparison of interest. Y-axis is the log fold change. The red points are the estimated log fold change from the model. The error bars are the confidence interval with 0.95 significant level for log fold change. This interval is only based on the standard error, which is estimated from the model. - -### Arguments - -* `data` : `xxx$ComparisonResult` in testing output from function `groupComparison`. -* `type` : choice of visualization. `"VolcanoPlot"` represents volcano plot of log fold changes and adjusted p-values for each comparison separately. `"Heatmap"` represents heatmap of adjusted p-values for multiple comparisons. `"ComparisonPlot"` represents comparison plot of log fold changes for multiple comparisons per protein. -* `sig` : FDR cutoff for the adjusted p-values in heatmap and volcano plot. $100(1-sig)$\% confidence interval will be drawn. `sig=0.05` is default. -* `FCcutoff` : for volcano plot or heatmap, whether involve fold change cutoff or not. `FALSE` (default) means no fold change cutoff is applied for significance analysis. `FCcutoff = specific value` means specific fold change cutoff is applied. -* `logBase.pvalue` : for volcano plot or heatmap, (-) logarithm transformation of adjusted p-value with base 2 or 10(default). -* `ylimUp` : for all three plots, upper limit for y-axis. `FALSE` (default) for volcano plot/heatmap use maximum of -log2 (adjusted p-value) or -log10 (adjusted p-value). `FALSE` (default) for comparison plot uses maximum of log-fold change + CI. -* `ylimDown` : for all three plots, lower limit for y-axis. `FALSE` (default) for volcano plot/heatmap use minimum of -log2 (adjusted p-value) or -log10 (adjusted p-value). `FALSE` (default) for comparison plot uses minimum of log-fold change - CI. -* `xlimUp` : for Volcano plot, the limit for x-axis. `FALSE` (default) uses the maximum for absolute value of log-fold change or 3 as default if maximum for absolute value of log-fold change is less than 3. -* `x.axis.size` : size of axes labels, e.g. name of the comparisons in heatmap, and in comparison plot. Default is `10`. -* `y.axis.size` : size of axes labels, e.g. name of targeted proteins in heatmap. Default is `10`. -* `dot.size` : size of dots in volcano plot and comparison plot. Default is `3`. -* `text.size` : size of `ProteinName` label in the graph for Volcano Plot. Default is `4`. -* `legend.size` : size of legend for color at the bottom of volcano plot. Default is `7`. -* `ProteinName` : for volcano plot only, whether display protein names next to dots or not. `TRUE` (default) means protein names, which are significant, are displayed next to the points. `FALSE` means no protein names are displayed. -* `numProtein` : The number of proteins which will be presented in each heatmap. Default is `100`. Maximum possible number of protein for one heatmap is 180. -* `clustering` : Determines how to order proteins and comparisons. Hierarchical cluster analysis with Ward method(minimum variance) is performed. `'protein'` (default) means that protein dendrogram is computed and reordered based on protein means (the order of row is changed). `'comparison'` means comparison dendrogram is computed and reordered based on comparison means (the order of comparison is changed). `'both'` means to reorder both protein and comparison. -* `width` : width of the saved file. Default is `10`. -* `height` : height of the saved file. Default is `10`. -* `which.Comparison` : list of comparisons to draw plots. List can be labels of comparisons or order numbers of comparisons from `levels(xxx$Label)`, such as `levels(xxx$ComparisonResult$Label)`. Default is `"all"`, which generates all plots for each protein. -* `address` : the name of folder that will store the results. Default folder is the current working directory. The other assigned folder has to be existed under the current working directory. An output pdf file is automatically created with the default name of `"VolcanoPlot.pdf"` or `"Heatmap.pdf"` or `"ComparisonPlot.pdf"`. The command address can help to specify where to store the file as well as how to modify the beginning of the file name. If `address=FALSE`, plot will be not saved as pdf file but showed in window. - -### Example -```{r, eval=FALSE} -QuantData <- dataProcess(SRMRawData) - -# based on multiple comparisons (T1 vs T3; T1 vs T7; T1 vs T9) -comparison1<-matrix(c(-1,0,1,0,0,0,0,0,0,0),nrow=1) -comparison2<-matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1) -comparison3<-matrix(c(-1,0,0,0,0,0,0,0,1,0),nrow=1) -comparison<-rbind(comparison1,comparison2, comparison3) -row.names(comparison)<-c("T3-T1","T7-T1","T9-T1") - -testResultMultiComparisons <- groupComparison(contrast.matrix=comparison, data=QuantData) - -# Volcano plot -groupComparisonPlots(data=testResultMultiComparisons$ComparisonResult, type="VolcanoPlot") - -# Heatmap -groupComparisonPlots(data=testResultMultiComparisons$ComparisonResult, type="Heatmap") - -# Comparison Plot -groupComparisonPlots(data=testResultMultiComparisons$ComparisonResult, type="ComparisonPlot") -``` - - -## modelBasedQCPlots - -Results based on statistical models for whole plot level inference are accurate as long as the assumptions of the model are met. The model assumes that the measurement errors are normally distributed with mean 0 and constant variance. The assumption of a constant variance can be checked by examining the residuals from the model. - -To check the assumption of linear model for whole plot inference, `modelBasedQCPlots` takes the results after fitting models from function `groupComparison` as input and automatically generate two types of figures in pdf files as output. - -- Normal quantile-quantile plot : For checking normally distributed errors. A normal quantile-quantile plot for each protein is generated in order to check whether the errors are well approximated by a normal distribution. If points fall approximately along a straight line, then the assumption is appropriate for that protein. Only large deviations from the line are problematic. - -- Residual plot : The plots of residuals against predicted (fitted) values. If it shows a random scatter, then the assumption is appropriate. - -### Arguments - -* `data` : output from function `groupComparison`. -* `type` : choice of visualization. `"QQPlots"` represents normal quantile-quantile plot for each protein after fitting models. `"ResidualPlots"` represents a plot of residuals versus fitted values for each protein in the dataset. -* `axis.size` : size of axes labels. Default is `10`. -* `dot.size` : size of points in the graph for residual plots and QQ plots. Default is `3`. -* `text.size` : size of labeling for feature names only in normal quantile-quantile plots separately for each feature. Default is `7`. -* `legend.size` : size of legend for feature names only in residual plots. Default is `7`. -* `width` : width of the saved file. Default is `10`. -* `height` : height of the saved file. Default is `10`. -* `address` : the name of folder that will store the results. Default folder is the current working directory. The other assigned folder has to be existed under the current working directory. If `type="residualPlots"` or `"QQPlots"`, `"ResidualPlots.pdf"` or `"QQPlots.plf"` will be generated. The command address can help to specify where to store the file as well as how to modify the beginning of the file name. If `address=FALSE`, plot will be not saved as pdf file but showed in window. - -### Example -```{r, eval=FALSE} -testResultOneComparison <- groupComparison(contrast.matrix=comparison, data=QuantData) - -# normal quantile-quantile plots -modelBasedQCPlots(data=testResultOneComparison, type="QQPlots") - -# residual plots -modelBasedQCPlots(data=testResultOneComparison, type="ResidualPlots") -``` - - -## designSampleSize - -Calculate sample size for future experiments of a Selected Reaction Monitoring (SRM), Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) experiment based on intensity-based linear model. The function fits the model and uses variance components to calculate sample size. The underlying model fitting with intensity-based linear model with technical MS run replication. Estimated sample size is rounded to 0 decimal. Two options of the calculation: - -- number of biological replicates per condition -- power - - -### Arguments - -* `data` : `'fittedmodel'` in testing output from function `groupComparison`. -* `desiredFC` : the range of a desired fold change which includes the lower and upper values of the desired fold change. -* `FDR` : a pre-specified false discovery ratio (FDR) to control the overall false positive. Default is `0.05`. -* `numSample` : minimal number of biological replicates per condition. `TRUE` represents you require to calculate the sample size for this category, else you should input the exact number of biological replicates. -* `power` : a pre-specified statistical power which defined as the probability of detecting a true fold change. `TRUE` represent you require to calculate the power for this category, else you should input the average of power you expect. Default is `0.9`. - - -### Example -```{r, eval=FALSE} -QuantData <- dataProcess(SRMRawData) -head(QuantData$ProcessedData) - -## based on multiple comparisons (T1 vs T3; T1 vs T7; T1 vs T9) -comparison1 <- matrix(c(-1,0,1,0,0,0,0,0,0,0),nrow=1) -comparison2 <- matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1) -comparison3 <- matrix(c(-1,0,0,0,0,0,0,0,1,0),nrow=1) -comparison <- rbind(comparison1,comparison2, comparison3) -row.names(comparison) <- c("T3-T1","T7-T1","T9-T1") - -testResultMultiComparisons <- groupComparison(contrast.matrix=comparison,data=QuantData) - -#(1) Minimal number of biological replicates per condition -designSampleSize(data=testResultMultiComparisons$fittedmodel, numSample=TRUE, - desiredFC=c(1.25,1.75), FDR=0.05, power=0.8) - -#(2) Power calculation -designSampleSize(data=testResultMultiComparisons$fittedmodel, numSample=2, - desiredFC=c(1.25,1.75), FDR=0.05, power=TRUE) -``` - -## designSampleSizePlots - -To illustrate the relationship of desired fold change and the calculated minimal number sample size which are - -- Number of biological replicates per condition -- power. - -The input is the result from function `designSampleSize`. - -### Arguments -* `data` : output from function `designSampleSize`. - -### Example - -```{r, eval=FALSE} -# (1) Minimal number of biological replicates per condition -result.sample <- designSampleSize(data=testResultMultiComparisons$fittedmodel, numSample=TRUE, - desiredFC=c(1.25,1.75), FDR=0.05, power=0.8) -designSampleSizePlots(data=result.sample) - -# (2) Power -result.power <- designSampleSize(data=testResultMultiComparisons$fittedmodel, numSample=2, - desiredFC=c(1.25,1.75), FDR=0.05, power=TRUE) -designSampleSizePlots(data=result.power) -``` - -## designSampleSizeClassification - -For classification problem (such as disgnosys of disease), simulation is utilized to estimate the optimal sample size of training data under different size of validation data, including Selected Reaction Monitoring (SRM), Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) experiment. The function fits intensity-based linear model and uses variance components and mean abundance to simulate new data with different sample size. Training data and validation data are simulated independently. Random forest model is fitted on train data and used to predict the validation data. The above procedure is repeated several times. Mean predictive accuracy and variance under different size of training data and validation data are reported. For each size of validation data, the number of training samples with best predictive accuracy is also returned. - - -### Arguments - -* `data` : output from function `dataProcess`. -* `n` : number of steps, which is the number of different sample sizes to simulate. Default is `5`. -* `step` : number of samples per condition to increase at each step. Default is `20`. -* `noise` : the fraction of protein variance to change at each step. Positive value indicates increasing noise in the data while negative value means decreasing noise. Default is `0.0`. -* `iter` : number of times to repeat simulation experiments. Default is `5`. - - -### Example -```{r, eval=FALSE} -# Consider quantitative data (i.e. QuantData) from yeast study. -# A time course study with ten time points of interests and three biological replicates. -QuantSRM <- dataProcess(SRMRawData) -# estimate the mean predictive accuray under different sizes of training data and validation data -# n is the number of biological replicates per condition -result.srm <- designSampleSizeClassification(data=QuantSRM, n=5, step=10) -result.srm$meanPA # mean predictive accuracy - -# Consider another training set from a colorectal cancer study -# Subjects are from control group or colorectal cancer group -# 72 proteins were targeted with SRM -require(MSstatsBioData) -set.seed(1235) -data(SRM_crc_training) -QuantCRCSRM <- dataProcess(SRM_crc_training) -# estimate the mean predictive accuray under different sizes of training data and validation data -# n is the number of biological replicates per condition -crc.srm <- designSampleSizeClassification(data=QuantCRCSRM, n=10, step=10) -crc.srm$meanPA # mean predictive accuracy -``` - -## designSampleSizeClassificationPlots - -To illustrate the mean classification accuracy under different size of training data and validation data. The input is the result from function `designSampleSizeClassification`. - -### Arguments -* `data` : output from function `designSampleSizeClassification`. - -### Example - -```{r, eval=FALSE} -# Consider another training set from a colorectal cancer study -# Subjects are from control group or colorectal cancer group -# 72 proteins were targeted with SRM -require(MSstatsBioData) -set.seed(1235) -data(SRM_crc_training) -QuantCRCSRM <- dataProcess(SRM_crc_training) -crc.srm <- designSampleSizeClassification(data=QuantCRCSRM, n=10, step=10) -designSampleSizeClassificationPlots(data=crc.srm) -``` - -## quantification - -Model-based quantification for each condition or for each biological samples per protein in a targeted Selected Reaction Monitoring (SRM), Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) experiment. Quantification takes the processed data set by `dataProcess` as input and automatically generate the quantification results (data.frame) with long or matrix format. The quantification for endogenous samples is based on run summarization from subplot model, with TMP robust estimation. - -- Sample quantification : individual biological sample quantification for each protein. The label of each biological sample is a combination of the corresponding group and the sample ID. If there are no technical replicates or experimental replicates per sample, sample quantification is the same as run summarization from dataProcess (`xxx$RunlevelData` from `dataProcess`). If there are technical replicates or experimental replicates, sample quantification is median among run quantification corresponding MS runs. - -- Group quantification : quantification for individual group or individual condition per protein. It is median among sample quantification. - - -### Arguments -* `data` : name of the (processed) data set from `dataPRocess`. -* `type` : choice of quantification. `"Sample"` or `"Group"` for protein sample quantification or group quantification. -* `format` : choice of returned format. `"long"` for long format which has the columns named `Protein`, `Condition`, `LogIntensities` (and `BioReplicate` if it is subject quantification), `NumFeature` for number of transitions for a protein, and `NumPeaks` for number of observed peak intensities for a protein. `"matrix"` for data matrix format which has the rows for Protein and the columns, which are `Groups`(or `Conditions`) for group quantification or the combinations of `BioReplicate` and `Condition` (labeled by `"BioReplicate"_"Condition"`) for sample quantification. Default is `"matrix"`, whether `format="long"` or `"matrix"`, both files, `"Group or SampleQuantification_longformat.csv"` and `"Group or SampleQuantification_dataMatrix.csv"` will be stored in the assigned folder. - -### Example -```{r, eval=FALSE} -QuantData <- dataProcess(SRMRawData) - -# Sample quantification -sampleQuant <- quantification(QuantData) - -# Group quantification -groupQuant <- quantification(QuantData, type="Group") -``` - -*** - -# Assay characterization - -## nonlinear_quantlim - -This function calculates the value of the LOB (limit of blank) and LOD (limit of detection) from the (`Concentration`, `Intensity`) spiked in data. This function should be used instead of the linear function whenever a significant threshold is present at low concentrations. Such threshold is characterized by a signal that is dominated by noise where the mean intensity is constant and independent of concentration. -The function also returns the values of the nonlinear curve fit that allows it to be plotted. At least 2 blank samples (characterized by `Intensity = 0`) are required by this function which are used to calculate the background noise. The LOB is defined as the concentration at which the value of the nonlinear fit is equal to the 95\% upper bound of the noise. The LOD is the concentration at which the latter is equal to the 90\% lower bound (5\% quantile) of the prediction interval of the nonlinear fit. -A weighted nonlinear fit is used with weights for every unique concentration proportional to the inverse of variance between replicates. The details behind the calculation of the nonlinear fit can be found in the Reference. - -The output is a data frame that contains the following columns: - -* `CONCENTRATION` : Concentration values at which the value of the fit is calculated. -* `MEAN` : The value of the curve fit. -* `LOW` : the lower bound of the 95\% prediction interval. -* `UP` : the upper bound of the 95\% prediction interval. -* `LOB` : LOB (one column with identical values) -* `LOD` : LOD (one column with identical values) -* `SLOPE` : the slope of the linear curve fit where only the spikes above LOD are considered. -* `INTERCEPT` : the intercept of the linear curve fit where only the spikes above LOD are considered . -* `NAME` : The name of the assay (identical to that provided in the input). -* `METHOD` : which is always set to `NONLINEAR` when this function is used. Each line of the data frame corresponds to a unique concentration value at which the value of the fit and prediction interval are evaluated. More unique concentrations values than in the input data frame are used to increase the accuracy of the LOB/D calculations. - -The LOB and LOD can only be calculated when more than 2 blank samples are included. -The data should ideally be plotted using the companion function `plot_quantlim` to ensure that the fit is suited to the data. - -### Arguments -* `datain` : Data frame that contains the input data. The input data frame has to contain the following columns : `CONCENTRATION`, `INTENSITY` (both of which are measurements from the spiked in experiment) and `NAME` which designates the name of the assay (e.g. the name of the peptide or protein), Each line of the data frame contains one measurement from the spiked-in experiment. Multiple different `INTENSITY` values for the same `CONCENTRATION` are assumed to correspond to different replicates. Blank Samples are characterized by `CONCENTRATION = 0`. -* `alpha` : Probability level to estimate the LOB/LOD. -* `Npoints` : Number of points to use to discretize the concentration line between 0 and the maximum spiked concentration. -* `Nbootstrap` : Number of bootstrap samples to use to calculate the prediction interval of the fit. This number has to be increased for very low alpha values or whenever very accurate assay characterization is required. - -### Example -```{r, eval=FALSE} -# Consider data from a spiked-in contained in an example dataset -head(SpikeInDataNonLinear) - -nonlinear_quantlim_out <- nonlinear_quantlim(SpikeInDataNonLinear) - -# Get values of LOB/LOD -nonlinear_quantlim_out$LOB[1] -nonlinear_quantlim_out$LOD[1] -``` - - -### linear_quantlim - -This function calculates the value of the LOB (limit of blank) and LOD (limit of detection) from the (`Concentration`, `Intensity`) spiked in data. The function also returns the values of the linear curve fit that allows it to be plotted. At least 2 blank samples (characterized by `Intensity = 0`) are required by this function which are used to calculate the background noise. The LOB is defined as the concentration at which the value of the linear fit is equal to the 95\% upper bound of the noise. The LOD is the concentration at which the latter is equal to the 90\% lower bound of the prediction interval (5\% quantile) of the linear fit. A weighted linear fit is used with weights for every unique concentration proportional to the inverse of variance between replicates. - -The output is a data frame that contains the following columns: - -* `CONCENTRATION` : Concentration values at which the value of the fit is calculated . -* `MEAN` : The value of the curve fit. -* `LOW` : the lower bound of the 95\% prediction interval. -* `UP` : the upper bound of the 95\% prediction interval. -* `LOB`: LOB (one column with identical values) -* `LOD` : LOD (one column with identical values) -* `SLOPE` : the slope of the linear curve fit where only the spikes above LOD are considered. -* `INTERCEPT` : the intercept of the linear curve fit where only the spikes above LOD are considered. -* `NAME` : The name of the assay (identical to that provided in the input). -* `METHOD` : which is always set to `LINEAR` when this function is used. Each line of the data frame corresponds to a unique concentration value at which the value of the fit and prediction interval are evaluated. More unique concentrations values than in the input data frame are used to increase the accuracy of the LOB/D calculations. - -The LOB and LOD can only be calculated when more than 2 blank samples are included. -The data should ideally be plotted using the companion function `plot_quantlim` to ensure that a linear fit is suited to the data. - -### Arguments -* `datain` : Data frame that contains the input data. The input data frame has to contain the following columns : `CONCENTRATION`, `INTENSITY` (both of which are measurements from the spiked in experiment) and `NAME` which designates the name of the assay (e.g. the name of the peptide or protein) Each line of the data frame contains one measurement from the spiked-in experiment. Multiple different `INTENSITY` values for the same `CONCENTRATION` are assumed to correspond to different replicates. Blank Samples are characterized by `CONCENTRATION = 0`. -* `alpha` : Probability level to estimate the LOB/LOD. -* `Npoints` : Number of points to use to discretize the concentration line between 0 and the maximum spiked concentration. -* `{Nbootstrap` : Number of bootstrap samples to use to calculate the prediction interval of the fit. This number has to be increased for very low alpha values or whenever very accurate assay characterization is required. - -### Example -```{r, eval=FALSE} -# Consider data from a spiked-in contained in an example dataset -head(SpikeInDataLinear) - -linear_quantlim_out <- linear_quantlim(SpikeInDataLinear) - -# Get values of LOB/LOD -linear_quantlim_out$LOB[1] -linear_quantlim_out$LOD[1] -``` - - -### plot_quantlim - -This function allows to plot the curve fit that is used to calculate the LOB and LOD with functions `nonlinear_quantlim` and `linear_quantlim`. The function outputs for each calibration curve, two pdf files each containg one plot. On the first, designated by `*_overall.pdf`, the entire concentration range is plotted. On the second plot, designated by `*_zoom.pdf`, the concentration range between 0 and `xlim_plot` (if specified in the argument of the function) is plotted. When no `xlim_plot` value is specified, the region close to LOB and LOD is automatically plotted. - -This plotting function should ideally be used every time `nonlinear_quantlim` or `linear_quantlim` are called to visually ensure that the fits and data are accurate. - -### Arguments -* `spikeindata` : Data frame that contains the experimental spiked in data. This data frame should be identical to that used as input by function functions `nonlinear_quantlim` or `linear_quantlim`. The data frame has to contain the following columns : `CONCENTRATION`, `INTENSITY` (both of which are measurements from the spiked in experiment) and NAME which designates the name of the assay (e.g. the name of the peptide or protein). -* `quantlim_out` : Data frame that was output by functions `nonlinear_quantlim` or `linear_quantlim`. -* `alpha` : Probability level to estimate the LOB/LOD. -* `dir_output` : String containg the path of the directly where the pdf files of the plots should be output. -* `xlim_plot` : Optional argument containing the maximum xaxis value of the zoom plot. When no value is specified, a suitable value close to LOD is automatically chosen. - -### Example -```{r, eval=FALSE} -# Consider data from a spiked-in contained in an example dataset -head(SpikeInDataNonLinear) - -nonlinear_quantlim_out <- nonlinear_quantlim(SpikeInDataNonLinear, alpha = 0.05) - -#Get values of LOB/LOD -nonlinear_quantlim_out$LOB[1] -nonlinear_quantlim_out$LOD[1] - -plot_quantlim(spikeindata = SpikeInDataLinear, quantlim_out = nonlinear_quantlim_out, -dir_output = getwd(), alpha = 0.05) -``` - - + +```{r style, echo = FALSE, results = 'asis'} +BiocStyle::markdown() +``` +```{r global_options, include=FALSE} +knitr::opts_chunk$set(fig.width=10, fig.height=7, warning=FALSE, message=FALSE) +options(width=110) +``` + + + +# MSstats: Protein/Peptide significance analysis + +Package: MSstats + +Author: Meena Choi , Tsung-Heng Tsai , Cyril Galitzine + +Date: October 2, 2017 + + +This vignette summarizes the introduction and various options of all functionalities in MSstats. More details are available in `User Manual`. + +- The types of experiment that MSstats can analyze are LC-MS, SRM, DIA(SWATH) with label-free or labeled synthetic peptides. MSstats does not support for metabolic labeling or iTRAQ experiments. + +## SkylinetoMSstatsFormat + +Preprocess MSstats input report from Skyline and convert into the required input format for MSstats. + +### Arguments + +* `input` : name of MSstats input report from Skyline, which includes feature-level data. +* `annotation` : name of 'annotation.txt' data which includes `Condition`, `BioReplicate`, `Run`. If annotation is already complete in Skyline, use `annotation=NULL` (default). It will use the annotation information from input. +* `removeiRT` : `removeiRT=TRUE`(default) will remove the proteins or peptides which are labeld 'iRT' in 'StandardType' column. `removeiRT=FALSE` will keep them. +* `removeProtein_with1Peptide` : `removeProtein_with1Peptide=TRUE` will remove the proteins which have only 1 peptide and charge. `removeProtein_with1Peptide=FALSE` is default. +* `filter_with_Qvalue` : `removeProtein_with1Peptide=TRUE`(default) will filter out the intensities that have greater than `qvalue_cutoff` in `DetectionQValue` column. Those intensities will be replaced with zero and will be considered as censored missing values for imputation purpose. +* `qvalue_cutoff` : Cutoff for `DetectionQValue`. default is 0.01. + +### Example +```{r, eval=FALSE} +# 'MSstatsInput.csv' is the MSstats report from Skyline. +input <- read.csv(file="MSstatsInput.csv") + +raw <- SkylinetoMSstatsFormat(input) +``` + +## MaxQtoMSstatsFormat + +Convert MaxQuant output into the required input format for MSstats. + +### Arguments + +* `evidence` : name of `'evidence.txt'` data, which includes feature-level data. +* `annotation` : name of `'annotation.txt'` data which includes `Raw.file`, `Condition`, `BioReplicate`, `Run`, `IsotopeLabelType` information. +* `proteinGroups` : name of `'proteinGroups.txt'` data. It needs to matching protein group ID. If `proteinGroups=NULL`, use `'Proteins'` column in `'evidence.txt'`. +* `proteinID` : `'Proteins'`(default) or `'proteinGroups'` in `'proteinGroup.txt'` for Protein ID. +* `useUniquePeptide` : `useUniquePeptide=TRUE`(default) removes peptides that are assigned for more than one proteins. We assume to use unique peptide for each protein. +* `summaryforMultipleRows` : `summaryforMultipleRows=max`(default) or sum, when there are multiple measurements for certain feature and certain run, use highest or sum of all. +* `fewMeasurements` : `fewMeasurements='remove'`(default) will remove the features that have 1 or 2 measurements across runs. +* `removeMpeptides` : `removeMpeptides=TRUE`(default) will remove the peptides including 'M' sequence. +* `removeProtein_with1Peptide` : `removeProtein_with1Peptide=TRUE` will remove the proteins which have only 1 peptide and charge. `FALSE` is default. + +### Example +```{r, eval=FALSE} +# Read in MaxQuant files +proteinGroups <- read.table("proteinGroups.txt", sep="\t", header=TRUE) + +infile <- read.table("evidence.txt", sep="\t", header=TRUE) + +# Read in annotation including condition and biological replicates per run. +# Users should make this annotation file. It is not the output from MaxQuant. +annot <- read.csv("annotation.csv", header=TRUE) + +raw <- MaxQtoMSstatsFormat(evidence=infile, + annotation=annot, + proteinGroups=proteinGroups) +``` + +## ProgenesistoMSstatsFormat + +Convert Progenesis output into the required input format for MSstats. + +### Arguments + +* `input` : name of Progenesis output, which is wide-format. `Accession`, `Sequence`, `Modification`, `Charge` and one column for each run are required. +* `annotation` : name of `'annotation.txt'` or `'annotation.csv'` data which includes `Condition`, `BioReplicate`, `Run` information. It will be matched with the column name of input for MS runs. +* `useUniquePeptide` : `useUniquePeptide=TRUE`(default) removes peptides that are assigned for more than one proteins. We assume to use unique peptide for each protein. +* `summaryforMultipleRows` : `summaryforMultipleRows=max`(default) or `sum`, when there are multiple measurements for certain feature and certain run, use highest or sum of multiple intensities. +* `fewMeasurements` : `fewMeasurements='remove'`(default) will remove the features that have 1 or 2 measurements across runs. +* `removeProtein_with1Peptide` : `removeProtein_with1Peptide=TRUE` will remove the proteins which have only 1 peptide and charge. `FALSE` is default. + +### Example +```{r, eval=FALSE} +input <- read.csv("output_progenesis.csv", stringsAsFactors=F) + +# Read in annotation including condition and biological replicates per run. +# Users should make this annotation file. It is not the output from Progenesis. +annot <- read.csv('annotation.csv') + +raw <- ProgenesistoMSstatsFormat(input, annotation=annot) +``` + +## SpectronauttoMSstatsFormat + +Convert Spectronaut output into the required input format for MSstats. + +### Arguments + +* `input`: name of Spectronaut output, which is long-format. `ProteinName`, `PeptideSequence`, `PrecursorCharge`, `FragmentIon`, `ProductCharge`, `IsotopeLabelType`, `Condition`, `BioReplicate`, `Run`, `Intensity`, `F.ExcludedFromQuantification` are required. Rows with `F.ExcludedFromQuantification=True` will be removed. +* `summaryforMultipleRows` : `summaryforMultipleRows=max`(default) or `sum`, when there are multiple measurements for certain feature and certain run, use highest or sum of multiple intensities. + +### Example +```{r, eval=FALSE} +input <- read.csv("output_spectronaut.csv", stringsAsFactors=F) + +quant <- SpectronauttoMSstatsFormat(input) +``` + + +## dataProcess + +Data pre-processing and quality control of MS runs of the original raw data into quantitative data for model fitting and group comparison. Log transformation is automatically applied and additional variables are created in columns for model fitting and group comparison process. Three options of data pre-processing and quality control of MS runs in dataProcess are + +- Transformation: logarithm transformation of intensity with base 2 or 10. +- Normalization: to remove systematic bias between MS runs. +- + +### Arguments + +* `raw` : name of the raw (input) data set. +* `logTrans` : logarithm transformation with base 2(default) or 10. If `logTrans=2`, the measurement of Variable `ABUNDANCE` is log-transformed with base 2. Same apply to `logTrans=10`. +* `normalization` : normalization to remove systematic bias between MS runs. There are three different normalizations supported. `'equalizeMedians'` (default) represents constant normalization (equalizing the medians) based on reference signals is performed. `'quantile'` represents quantile normalization based on reference signals is performed. `'globalStandards'` represents normalization with global standards proteins. `FALSE` represents no normalization is performed. +* `betweenRunInterferenceScore` : interference is detected by a between-run-interference score. `TRUE` means the scores are generated automatically and stored in a .csv file. `FALSE` (default) means no scores are generated. +* `fillIncompleteRows` : If the input dataset has incomplete rows, `TRUE` (default) adds the rows with intensity value=NA for missing peaks. `FALSE` reports error message with list of features which have incomplete rows. +* `featureSubset` : `"all"` (default) uses all features that the data set has. `"top3"` uses top 3 features which have highest average of log2(intensity) across runs. `"topN"` uses top N features which has highest average of log2(intensity) across runs. It needs the input for `n_top_feature` option. `"highQuality"` selects the most informative features which agree the pattern of the average features across the runs. +* `remove_proteins_with_interference` : `TRUE` allows the algorithm to remove the proteins if deem interfered. `FALSE` (default) does not allow to remove the proteins, in which all features are interfered. In this case, the proteins, which will completely loss all features by the algorithm, will keep the most abundant peptide +* `n_top_feature` : The number of top features for `featureSubset='topN'`. Default is `n_top_feature=3`, which means to use top 3 features.} +* `summaryMethod` : `"TMP"` (default) means Tukey's median polish, which is robust estimation method. `"linear"` uses linear mixed model. +* `equalFeatureVar` : only for `summaryMethod="linear"`. Default is `TRUE`. Logical variable for whether the model should account for heterogeneous variation among intensities from different features. Default is TRUE, which assume equal variance among intensities from features. `FALSE` means that we cannot assume equal variance among intensities from features, then we will account for heterogeneous variation from different features. +* `censoredInt` : Missing values are censored or at random. `'NA'` (default) assumes that all 'NA's in 'Intensity' column are censored. `'0'` uses zero intensities as censored intensity. In this case, NA intensities are missing at random. The output from Skyline and Progenesis should use '0'. Null assumes that all NA intensites are randomly missing. +* `cutoffCensored` : Cutoff value for censoring. Only with `censoredInt='NA'` or `'0'`. Default is `'minFeature'`, which uses minimum value for each feature. `'minFeatureNRun'` uses the smallest between minimum value of corresponding feature and minimum value of corresponding run. `'minRun'` uses minumum value for each run. +* `MBimpute` : only for `summaryMethod="TMP"` and `censoredInt='NA'` or `'0'`. `TRUE` (default) imputes 'NA' or '0' (depending on censoredInt option) by Accelated failure model. `FALSE` uses the values assigned by cutoffCensored. +* `remove50missing` : only for `summaryMethod="TMP"`. `TRUE` removes the runs which have more than 50\% missing values. `FALSE` is default. +* `address` : the name of folder that will store the results. Default folder is the current working directory. The other assigned folder has to be existed under the current working directory. An output csv file is automatically created with the default name of "BetweenRunInterferenceFile.csv". The command address can help to specify where to store the file as well as how to modify the beginning of the file name. +* `maxQuantileforCensored` : Maximum quantile for deciding censored missing values. Default is 0.999. + +### Example +```{r, eval=FALSE} +QuantData <- dataProcess(SRMRawData) +``` + + +## dataProcessPlots + +Visualization for explanatory data analysis. To illustrate the quantitative data after data-preprocessing and quality control of MS runs, `dataProcessPlots` takes the quantitative data from function `dataProcess` as input and automatically generate three types of figures in pdf files as output : + + - Profile plot : to identify the potential sources of variation for each protein. One of output from `dataProcess`, `xxx$ProcessedData`, is used for profile plots. Another output from `dataProcess`, `xxx$RunlevelData`, is used for profile plots with summarization. X-axis represents MS runs. Y-axis is for log-intensities of transitions. Reference/endogenous signals are in the left/right panel. Line colors indicate peptides and line types indicate transitions. Type of dots(filled vs empty dot) indicates censored missing values or not. In summarization plots, gray dots and lines are the same as original profile plots with `xxx$ProcessedData`. Dark dots and lines are for summarized intensities from `xxx$RunlevelData`. + + - Quality control plot : to illustrate the systematic bias between MS runs and to check normalization. After normalization, the reference signals for all proteins should be stable across MS runs. `xxx$ProcessedData` is used for plots. X-axis is for MS runs. Y-axis is for log-intensities of transition (`xxx$ProcessedData$ABUNDANCE`). Reference/endogenous signals are in the left/right panel. The pdf file contains (1) QC plot for all proteins and (2) QC plots for each protein separately. + + - Mean plot for conditions : to illustrate mean and variability of each condition per protein. Summarized intensnties from `xxx$RunlevelData` are used for plots. X-axis is for condition. Y-axis is for summarized log transformed intensities. If `scale=TRUE`, the levels of conditions is scaled according to its actual values at x-axis. Red points indicate the mean for each condition. If `interval="CI"`, error bars indicate the confidence interval with 0.95 significant level for each condition. If `interval="SD"`, error bars indicate the standard deviation for each condition. **The interval is not related with model-based analysis in `groupCompaison`**. + +### Arguments + +* `data` : name of the (output of `dataProcess` function) data set. +* `type` : choice of visualization. `"ProfilePlot"` represents profile plot of log intensities across MS runs. `"QCPlot"` represents quality control plot of log intensities across MS runs. `"ConditionPlot"` represents mean plot of log ratios (Light/Heavy) across conditions. +* `featureName` : for `"ProfilePlot"` only, `"Transition"` (default) means printing feature legend in transition-level. `"Peptide"` means printing feature legend in peptide-level. `"NA"` means no feature legend printing. +* `ylimUp` : upper limit for y-axis in the log scale. `FALSE` (Default) for Profile Plot and QC Plot use the upper limit as rounded off maximum of log2(intensities) after normalization + 3. `FALSE` (Default) for Condition Plot is maximum of log ratio + SD or CI. +* `ylimDown` : lower limit for y-axis in the log scale. `FALSE` (Default) for Profile Plot and QC Plot is 0. `FALSE`` (Default) for Condition Plot is minumum of log ratio - SD or CI. +* `scale` : for `"ConditionPlot"` only, `FALSE` (default) means each conditional level is not scaled at x-axis according to its actual value (equal space at x-axis). `TRUE` means each conditional level is scaled at x-axis according to its actual value (unequal space at x-axis). +* `interval` : for `"ConditionPlot"` only, `"CI"` (default) uses confidence interval with 0.95 significant level for the width of error bar. `"SD"` uses standard deviation for the width of error bar. +* `x.axis.size` : size of x-axis labeling for "MS Runs" in Profile Plot and QC Plot, and "Condition" in Condition Plot. Default is `10`. +* `y.axis.size` : size of y-axis labels. Default is `10`. +* `text.size` : size of labels represented each condition at the top of graph in Profile Plot and QC plot. Default is `4` +* `text.angle` : angle of labels represented each condition at the top of graph in Profile Plot and QC plot or x-axis labeling in Condition plot. Default is `0`. +* `legend.size` : size of feature legend (transition-level or peptide-level) above graph in Profile Plot. Default is `7`. +* `dot.size.profile` : size of dots in profile plot. Default is `2`. +* `dot.size.condition` : size of dots in condition plot. Default is `3`. +* `width` : width of the saved file. Default is 10. +* `height` : height of the saved file. Default is 10. +* `which.Protein` : Protein list to draw plots. List can be names of Proteins or order numbers of Proteins from `levels(xxx$ProcessedData$PROTEIN)`. Default is `"all"`, which generates all plots for each protein. +* `originalPlot` : `TRUE` (default) draws original profile plots. +* `summaryPlot` : `TRUE` (default) draws profile plots with summarization for run levels. +* `save_condition_plot_result` : `TRUE` saves the table with values using condition plots. Default is FALSE. +* `address` : the name of folder that will store the results. Default folder is the current working directory. The other assigned folder has to be existed under the current working directory. An output pdf file is automatically created with the default name of `"ProfilePlot.pdf"` or `"QCplot.pdf"` or `"ConditionPlot.pdf"` or `"ConditionPlot_value.csv"`. The command address can help to specify where to store the file as well as how to modify the beginning of the file name. If `address=FALSE`, plot will be not saved as pdf file but showed in window. + + +### Example +```{r, eval=FALSE} +QuantData <- dataProcess(SRMRawData) + +# Profile plot +dataProcessPlots(data=QuantData, type="ProfilePlot") + +# Quality control plot +dataProcessPlots(data=QuantData, type="QCPlot") + +# Quantification plot for conditions +dataProcessPlots(data=QuantData, type="ConditionPlot") +``` + + +## groupComparison +Tests for significant changes in protein abundance across conditions based on a family of linear mixed-effects models in targeted Selected Reaction Monitoring (SRM), Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) + experiment. It is applicable to multiple types of sample preparation, including label-free workflows, workflows that use stable isotope labeled reference proteins and peptides, and workflows that use fractionation. Experimental design of case-control study (patients are not repeatedly measured) or time course study (patients are repeatedly measured) is automatically determined based on proper statistical model. + +### Arguments + +* `contrast.matrix` : comparison between conditions of interests. Based on the levels of conditions, specify 1 or -1 to the conditions of interests and 0 otherwise. The levels of conditions are sorted alphabetically. Command `levels(xxx$ProcessedData$GROUP_ORIGINAL)` after using `dataProcess` function can illustrate the actual order of the levels of conditions. +* `data` : name of the (output of dataProcess function) data set. + + +### Example +```{r, eval=FALSE} +QuantData <- dataProcess(SRMRawData) + +levels(QuantData$ProcessedData$GROUP_ORIGINAL) +comparison <- matrix(c(-1,0,0,0,0,0,1,0,0,0), nrow=1) +row.names(comparison) <- "T7-T1" + +# Tests for differentially abundant proteins with models: +testResultOneComparison <- groupComparison(contrast.matrix=comparison, data=QuantData) +``` + +## groupComparisonPlots + +Visualization for model-based analysis and summarizing differentially abundant proteins. To summarize the results of log-fold changes and adjusted p-values for differentially abundant proteins, `groupComparisonPlots` takes testing results from function `groupComparison` as input and automatically generate three types of figures in pdf files as output : + +- Volcano plot : For each comparison separately. It illustrates actual log-fold changes and adjusted p-values for each comparison separately with all proteins. The x-axis is the log fold change. The base of logarithm transformation is the same as specified in "logTrans" from `dataProcess`. The y-axis is the negative log2 or log10 adjusted p-values. The horizontal dashed line represents the FDR cutoff. The points below the FDR cutoff line are non-significantly abundant proteins (colored in black). The points above the FDR cutoff line are significantly abundant proteins (colored in red/blue for up-/down-regulated). If fold change cutoff is specified (`FCcutoff = specific value`), the points above the FDR cutoff line but within the FC cutoff line are non-significantly abundant proteins (colored in black). + +- Heatmap : For multiple comparisons. It illustratea up-/down-regulated proteins for multiple comparisons with all proteins. Each column represents each comparison of interest. Each row represents each protein. Color red/blue represents proteins in that specific comparison are significantly up-regulated/down-regulated proteins with FDR cutoff and/or FC cutoff. The color scheme shows the evidences of significance. The darker color it is, the stronger evidence of significance it has. Color gold represents proteins are not significantly different in abundance. + +- Comparison plot : For multiple comparisons per protein. It illustrates log-fold change and its variation of multiple comparisons for single protein. X-axis is comparison of interest. Y-axis is the log fold change. The red points are the estimated log fold change from the model. The error bars are the confidence interval with 0.95 significant level for log fold change. This interval is only based on the standard error, which is estimated from the model. + +### Arguments + +* `data` : `xxx$ComparisonResult` in testing output from function `groupComparison`. +* `type` : choice of visualization. `"VolcanoPlot"` represents volcano plot of log fold changes and adjusted p-values for each comparison separately. `"Heatmap"` represents heatmap of adjusted p-values for multiple comparisons. `"ComparisonPlot"` represents comparison plot of log fold changes for multiple comparisons per protein. +* `sig` : FDR cutoff for the adjusted p-values in heatmap and volcano plot. $100(1-sig)$\% confidence interval will be drawn. `sig=0.05` is default. +* `FCcutoff` : for volcano plot or heatmap, whether involve fold change cutoff or not. `FALSE` (default) means no fold change cutoff is applied for significance analysis. `FCcutoff = specific value` means specific fold change cutoff is applied. +* `logBase.pvalue` : for volcano plot or heatmap, (-) logarithm transformation of adjusted p-value with base 2 or 10(default). +* `ylimUp` : for all three plots, upper limit for y-axis. `FALSE` (default) for volcano plot/heatmap use maximum of -log2 (adjusted p-value) or -log10 (adjusted p-value). `FALSE` (default) for comparison plot uses maximum of log-fold change + CI. +* `ylimDown` : for all three plots, lower limit for y-axis. `FALSE` (default) for volcano plot/heatmap use minimum of -log2 (adjusted p-value) or -log10 (adjusted p-value). `FALSE` (default) for comparison plot uses minimum of log-fold change - CI. +* `xlimUp` : for Volcano plot, the limit for x-axis. `FALSE` (default) uses the maximum for absolute value of log-fold change or 3 as default if maximum for absolute value of log-fold change is less than 3. +* `x.axis.size` : size of axes labels, e.g. name of the comparisons in heatmap, and in comparison plot. Default is `10`. +* `y.axis.size` : size of axes labels, e.g. name of targeted proteins in heatmap. Default is `10`. +* `dot.size` : size of dots in volcano plot and comparison plot. Default is `3`. +* `text.size` : size of `ProteinName` label in the graph for Volcano Plot. Default is `4`. +* `legend.size` : size of legend for color at the bottom of volcano plot. Default is `7`. +* `ProteinName` : for volcano plot only, whether display protein names next to dots or not. `TRUE` (default) means protein names, which are significant, are displayed next to the points. `FALSE` means no protein names are displayed. +* `numProtein` : The number of proteins which will be presented in each heatmap. Default is `100`. Maximum possible number of protein for one heatmap is 180. +* `clustering` : Determines how to order proteins and comparisons. Hierarchical cluster analysis with Ward method(minimum variance) is performed. `'protein'` (default) means that protein dendrogram is computed and reordered based on protein means (the order of row is changed). `'comparison'` means comparison dendrogram is computed and reordered based on comparison means (the order of comparison is changed). `'both'` means to reorder both protein and comparison. +* `width` : width of the saved file. Default is `10`. +* `height` : height of the saved file. Default is `10`. +* `which.Comparison` : list of comparisons to draw plots. List can be labels of comparisons or order numbers of comparisons from `levels(xxx$Label)`, such as `levels(xxx$ComparisonResult$Label)`. Default is `"all"`, which generates all plots for each protein. +* `address` : the name of folder that will store the results. Default folder is the current working directory. The other assigned folder has to be existed under the current working directory. An output pdf file is automatically created with the default name of `"VolcanoPlot.pdf"` or `"Heatmap.pdf"` or `"ComparisonPlot.pdf"`. The command address can help to specify where to store the file as well as how to modify the beginning of the file name. If `address=FALSE`, plot will be not saved as pdf file but showed in window. + +### Example +```{r, eval=FALSE} +QuantData <- dataProcess(SRMRawData) + +# based on multiple comparisons (T1 vs T3; T1 vs T7; T1 vs T9) +comparison1<-matrix(c(-1,0,1,0,0,0,0,0,0,0),nrow=1) +comparison2<-matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1) +comparison3<-matrix(c(-1,0,0,0,0,0,0,0,1,0),nrow=1) +comparison<-rbind(comparison1,comparison2, comparison3) +row.names(comparison)<-c("T3-T1","T7-T1","T9-T1") + +testResultMultiComparisons <- groupComparison(contrast.matrix=comparison, data=QuantData) + +# Volcano plot +groupComparisonPlots(data=testResultMultiComparisons$ComparisonResult, type="VolcanoPlot") + +# Heatmap +groupComparisonPlots(data=testResultMultiComparisons$ComparisonResult, type="Heatmap") + +# Comparison Plot +groupComparisonPlots(data=testResultMultiComparisons$ComparisonResult, type="ComparisonPlot") +``` + + +## modelBasedQCPlots + +Results based on statistical models for whole plot level inference are accurate as long as the assumptions of the model are met. The model assumes that the measurement errors are normally distributed with mean 0 and constant variance. The assumption of a constant variance can be checked by examining the residuals from the model. + +To check the assumption of linear model for whole plot inference, `modelBasedQCPlots` takes the results after fitting models from function `groupComparison` as input and automatically generate two types of figures in pdf files as output. + +- Normal quantile-quantile plot : For checking normally distributed errors. A normal quantile-quantile plot for each protein is generated in order to check whether the errors are well approximated by a normal distribution. If points fall approximately along a straight line, then the assumption is appropriate for that protein. Only large deviations from the line are problematic. + +- Residual plot : The plots of residuals against predicted (fitted) values. If it shows a random scatter, then the assumption is appropriate. + +### Arguments + +* `data` : output from function `groupComparison`. +* `type` : choice of visualization. `"QQPlots"` represents normal quantile-quantile plot for each protein after fitting models. `"ResidualPlots"` represents a plot of residuals versus fitted values for each protein in the dataset. +* `axis.size` : size of axes labels. Default is `10`. +* `dot.size` : size of points in the graph for residual plots and QQ plots. Default is `3`. +* `text.size` : size of labeling for feature names only in normal quantile-quantile plots separately for each feature. Default is `7`. +* `legend.size` : size of legend for feature names only in residual plots. Default is `7`. +* `width` : width of the saved file. Default is `10`. +* `height` : height of the saved file. Default is `10`. +* `address` : the name of folder that will store the results. Default folder is the current working directory. The other assigned folder has to be existed under the current working directory. If `type="residualPlots"` or `"QQPlots"`, `"ResidualPlots.pdf"` or `"QQPlots.plf"` will be generated. The command address can help to specify where to store the file as well as how to modify the beginning of the file name. If `address=FALSE`, plot will be not saved as pdf file but showed in window. + +### Example +```{r, eval=FALSE} +testResultOneComparison <- groupComparison(contrast.matrix=comparison, data=QuantData) + +# normal quantile-quantile plots +modelBasedQCPlots(data=testResultOneComparison, type="QQPlots") + +# residual plots +modelBasedQCPlots(data=testResultOneComparison, type="ResidualPlots") +``` + + +## designSampleSize + +Calculate sample size for future experiments of a Selected Reaction Monitoring (SRM), Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) experiment based on intensity-based linear model. The function fits the model and uses variance components to calculate sample size. The underlying model fitting with intensity-based linear model with technical MS run replication. Estimated sample size is rounded to 0 decimal. Two options of the calculation: + +- number of biological replicates per condition +- power + + +### Arguments + +* `data` : `'fittedmodel'` in testing output from function `groupComparison`. +* `desiredFC` : the range of a desired fold change which includes the lower and upper values of the desired fold change. +* `FDR` : a pre-specified false discovery ratio (FDR) to control the overall false positive. Default is `0.05`. +* `numSample` : minimal number of biological replicates per condition. `TRUE` represents you require to calculate the sample size for this category, else you should input the exact number of biological replicates. +* `power` : a pre-specified statistical power which defined as the probability of detecting a true fold change. `TRUE` represent you require to calculate the power for this category, else you should input the average of power you expect. Default is `0.9`. + + +### Example +```{r, eval=FALSE} +QuantData <- dataProcess(SRMRawData) +head(QuantData$ProcessedData) + +## based on multiple comparisons (T1 vs T3; T1 vs T7; T1 vs T9) +comparison1 <- matrix(c(-1,0,1,0,0,0,0,0,0,0),nrow=1) +comparison2 <- matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1) +comparison3 <- matrix(c(-1,0,0,0,0,0,0,0,1,0),nrow=1) +comparison <- rbind(comparison1,comparison2, comparison3) +row.names(comparison) <- c("T3-T1","T7-T1","T9-T1") + +testResultMultiComparisons <- groupComparison(contrast.matrix=comparison,data=QuantData) + +#(1) Minimal number of biological replicates per condition +designSampleSize(data=testResultMultiComparisons$fittedmodel, numSample=TRUE, + desiredFC=c(1.25,1.75), FDR=0.05, power=0.8) + +#(2) Power calculation +designSampleSize(data=testResultMultiComparisons$fittedmodel, numSample=2, + desiredFC=c(1.25,1.75), FDR=0.05, power=TRUE) +``` + +## designSampleSizePlots + +To illustrate the relationship of desired fold change and the calculated minimal number sample size which are + +- Number of biological replicates per condition +- power. + +The input is the result from function `designSampleSize`. + +### Arguments +* `data` : output from function `designSampleSize`. + +### Example + +```{r, eval=FALSE} +# (1) Minimal number of biological replicates per condition +result.sample <- designSampleSize(data=testResultMultiComparisons$fittedmodel, numSample=TRUE, + desiredFC=c(1.25,1.75), FDR=0.05, power=0.8) +designSampleSizePlots(data=result.sample) + +# (2) Power +result.power <- designSampleSize(data=testResultMultiComparisons$fittedmodel, numSample=2, + desiredFC=c(1.25,1.75), FDR=0.05, power=TRUE) +designSampleSizePlots(data=result.power) +``` + +## designSampleSizeClassification + +For classification problem (such as disgnosys of disease), simulation is utilized to estimate the optimal size of training data, including Selected Reaction Monitoring (SRM), Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) experiment. The function fits intensity-based linear model based on the input preliminary data and uses the estimated variance components to simulate new training data with different protein number and sample size. Random forest model is fitted on train data and used to predict the input preliminary data. The above procedure is repeated several times. Mean predictive accuracy and variance under different size of training data are reported. + +### Arguments + +* `data` : output from function `dataProcess`. +* `n_sample` : number of datasets to simulate. Default is `5`. +* `sample_incr` : number of samples per condition to increase at each step. Default is `20`. +* `protein_desc` : the fraction of protein to decrease at each step. Proteins are ranked based on their mean abundance across all the samples. Default is `0.2`, which means $0.2*$total number of proteins are reduced each time. +* `iter` : number of times to repeat simulation experiments. Default is `10`. + + +### Example +```{r, eval=FALSE} +# Consider another training set from a colorectal cancer study +# Subjects are from control group or colorectal cancer group +# 72 proteins were targeted with SRM +require(MSstatsBioData) +set.seed(1235) +data(SRM_crc_training) +QuantCRCSRM <- dataProcess(SRM_crc_training, normalization = FALSE) +# estimate the mean predictive accuray under different sizes of training data +# n_sample is the number of different sample size to simulate +# Datasets with 10 different sample size and 3 different protein numbers are simulated +# Each time 10 samples per condition are increased and 72*0.33 = 24 proteins are descreased. +# Simulation procedure is repeated 100 times +result.crc.srm <- designSampleSizeClassification(data=QuantCRCSRM, + n_sample = 10, + sample_incr = 10, + protein_desc = 0.33, + iter = 50) +result.crc.srm$meanPA # mean predictive accuracy +``` + +## designSampleSizeClassificationPlots + +To illustrate the mean classification accuracy under different size of training data. The input is the result from function `designSampleSizeClassification`. + +### Arguments +* `data` : output from function `designSampleSizeClassification`. + +### Example + +```{r, eval=FALSE} +designSampleSizeClassificationPlots(data=result.crc.srm) +``` + +## quantification + +Model-based quantification for each condition or for each biological samples per protein in a targeted Selected Reaction Monitoring (SRM), Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) experiment. Quantification takes the processed data set by `dataProcess` as input and automatically generate the quantification results (data.frame) with long or matrix format. The quantification for endogenous samples is based on run summarization from subplot model, with TMP robust estimation. + +- Sample quantification : individual biological sample quantification for each protein. The label of each biological sample is a combination of the corresponding group and the sample ID. If there are no technical replicates or experimental replicates per sample, sample quantification is the same as run summarization from dataProcess (`xxx$RunlevelData` from `dataProcess`). If there are technical replicates or experimental replicates, sample quantification is median among run quantification corresponding MS runs. + +- Group quantification : quantification for individual group or individual condition per protein. It is median among sample quantification. + + +### Arguments +* `data` : name of the (processed) data set from `dataPRocess`. +* `type` : choice of quantification. `"Sample"` or `"Group"` for protein sample quantification or group quantification. +* `format` : choice of returned format. `"long"` for long format which has the columns named `Protein`, `Condition`, `LogIntensities` (and `BioReplicate` if it is subject quantification), `NumFeature` for number of transitions for a protein, and `NumPeaks` for number of observed peak intensities for a protein. `"matrix"` for data matrix format which has the rows for Protein and the columns, which are `Groups`(or `Conditions`) for group quantification or the combinations of `BioReplicate` and `Condition` (labeled by `"BioReplicate"_"Condition"`) for sample quantification. Default is `"matrix"`, whether `format="long"` or `"matrix"`, both files, `"Group or SampleQuantification_longformat.csv"` and `"Group or SampleQuantification_dataMatrix.csv"` will be stored in the assigned folder. + +### Example +```{r, eval=FALSE} +QuantData <- dataProcess(SRMRawData) + +# Sample quantification +sampleQuant <- quantification(QuantData) + +# Group quantification +groupQuant <- quantification(QuantData, type="Group") +``` + +*** + +# Assay characterization + +## nonlinear_quantlim + +This function calculates the value of the LOB (limit of blank) and LOD (limit of detection) from the (`Concentration`, `Intensity`) spiked in data. This function should be used instead of the linear function whenever a significant threshold is present at low concentrations. Such threshold is characterized by a signal that is dominated by noise where the mean intensity is constant and independent of concentration. +The function also returns the values of the nonlinear curve fit that allows it to be plotted. At least 2 blank samples (characterized by `Intensity = 0`) are required by this function which are used to calculate the background noise. The LOB is defined as the concentration at which the value of the nonlinear fit is equal to the 95\% upper bound of the noise. The LOD is the concentration at which the latter is equal to the 90\% lower bound (5\% quantile) of the prediction interval of the nonlinear fit. +A weighted nonlinear fit is used with weights for every unique concentration proportional to the inverse of variance between replicates. The details behind the calculation of the nonlinear fit can be found in the Reference. + +The output is a data frame that contains the following columns: + +* `CONCENTRATION` : Concentration values at which the value of the fit is calculated. +* `MEAN` : The value of the curve fit. +* `LOW` : the lower bound of the 95\% prediction interval. +* `UP` : the upper bound of the 95\% prediction interval. +* `LOB` : LOB (one column with identical values) +* `LOD` : LOD (one column with identical values) +* `SLOPE` : the slope of the linear curve fit where only the spikes above LOD are considered. +* `INTERCEPT` : the intercept of the linear curve fit where only the spikes above LOD are considered . +* `NAME` : The name of the assay (identical to that provided in the input). +* `METHOD` : which is always set to `NONLINEAR` when this function is used. Each line of the data frame corresponds to a unique concentration value at which the value of the fit and prediction interval are evaluated. More unique concentrations values than in the input data frame are used to increase the accuracy of the LOB/D calculations. + +The LOB and LOD can only be calculated when more than 2 blank samples are included. +The data should ideally be plotted using the companion function `plot_quantlim` to ensure that the fit is suited to the data. + +### Arguments +* `datain` : Data frame that contains the input data. The input data frame has to contain the following columns : `CONCENTRATION`, `INTENSITY` (both of which are measurements from the spiked in experiment) and `NAME` which designates the name of the assay (e.g. the name of the peptide or protein), Each line of the data frame contains one measurement from the spiked-in experiment. Multiple different `INTENSITY` values for the same `CONCENTRATION` are assumed to correspond to different replicates. Blank Samples are characterized by `CONCENTRATION = 0`. +* `alpha` : Probability level to estimate the LOB/LOD. +* `Npoints` : Number of points to use to discretize the concentration line between 0 and the maximum spiked concentration. +* `Nbootstrap` : Number of bootstrap samples to use to calculate the prediction interval of the fit. This number has to be increased for very low alpha values or whenever very accurate assay characterization is required. + +### Example +```{r, eval=FALSE} +# Consider data from a spiked-in contained in an example dataset +head(SpikeInDataNonLinear) + +nonlinear_quantlim_out <- nonlinear_quantlim(SpikeInDataNonLinear) + +# Get values of LOB/LOD +nonlinear_quantlim_out$LOB[1] +nonlinear_quantlim_out$LOD[1] +``` + + +### linear_quantlim + +This function calculates the value of the LOB (limit of blank) and LOD (limit of detection) from the (`Concentration`, `Intensity`) spiked in data. The function also returns the values of the linear curve fit that allows it to be plotted. At least 2 blank samples (characterized by `Intensity = 0`) are required by this function which are used to calculate the background noise. The LOB is defined as the concentration at which the value of the linear fit is equal to the 95\% upper bound of the noise. The LOD is the concentration at which the latter is equal to the 90\% lower bound of the prediction interval (5\% quantile) of the linear fit. A weighted linear fit is used with weights for every unique concentration proportional to the inverse of variance between replicates. + +The output is a data frame that contains the following columns: + +* `CONCENTRATION` : Concentration values at which the value of the fit is calculated . +* `MEAN` : The value of the curve fit. +* `LOW` : the lower bound of the 95\% prediction interval. +* `UP` : the upper bound of the 95\% prediction interval. +* `LOB`: LOB (one column with identical values) +* `LOD` : LOD (one column with identical values) +* `SLOPE` : the slope of the linear curve fit where only the spikes above LOD are considered. +* `INTERCEPT` : the intercept of the linear curve fit where only the spikes above LOD are considered. +* `NAME` : The name of the assay (identical to that provided in the input). +* `METHOD` : which is always set to `LINEAR` when this function is used. Each line of the data frame corresponds to a unique concentration value at which the value of the fit and prediction interval are evaluated. More unique concentrations values than in the input data frame are used to increase the accuracy of the LOB/D calculations. + +The LOB and LOD can only be calculated when more than 2 blank samples are included. +The data should ideally be plotted using the companion function `plot_quantlim` to ensure that a linear fit is suited to the data. + +### Arguments +* `datain` : Data frame that contains the input data. The input data frame has to contain the following columns : `CONCENTRATION`, `INTENSITY` (both of which are measurements from the spiked in experiment) and `NAME` which designates the name of the assay (e.g. the name of the peptide or protein) Each line of the data frame contains one measurement from the spiked-in experiment. Multiple different `INTENSITY` values for the same `CONCENTRATION` are assumed to correspond to different replicates. Blank Samples are characterized by `CONCENTRATION = 0`. +* `alpha` : Probability level to estimate the LOB/LOD. +* `Npoints` : Number of points to use to discretize the concentration line between 0 and the maximum spiked concentration. +* `{Nbootstrap` : Number of bootstrap samples to use to calculate the prediction interval of the fit. This number has to be increased for very low alpha values or whenever very accurate assay characterization is required. + +### Example +```{r, eval=FALSE} +# Consider data from a spiked-in contained in an example dataset +head(SpikeInDataLinear) + +linear_quantlim_out <- linear_quantlim(SpikeInDataLinear) + +# Get values of LOB/LOD +linear_quantlim_out$LOB[1] +linear_quantlim_out$LOD[1] +``` + + +### plot_quantlim + +This function allows to plot the curve fit that is used to calculate the LOB and LOD with functions `nonlinear_quantlim` and `linear_quantlim`. The function outputs for each calibration curve, two pdf files each containg one plot. On the first, designated by `*_overall.pdf`, the entire concentration range is plotted. On the second plot, designated by `*_zoom.pdf`, the concentration range between 0 and `xlim_plot` (if specified in the argument of the function) is plotted. When no `xlim_plot` value is specified, the region close to LOB and LOD is automatically plotted. + +This plotting function should ideally be used every time `nonlinear_quantlim` or `linear_quantlim` are called to visually ensure that the fits and data are accurate. + +### Arguments +* `spikeindata` : Data frame that contains the experimental spiked in data. This data frame should be identical to that used as input by function functions `nonlinear_quantlim` or `linear_quantlim`. The data frame has to contain the following columns : `CONCENTRATION`, `INTENSITY` (both of which are measurements from the spiked in experiment) and NAME which designates the name of the assay (e.g. the name of the peptide or protein). +* `quantlim_out` : Data frame that was output by functions `nonlinear_quantlim` or `linear_quantlim`. +* `alpha` : Probability level to estimate the LOB/LOD. +* `dir_output` : String containg the path of the directly where the pdf files of the plots should be output. +* `xlim_plot` : Optional argument containing the maximum xaxis value of the zoom plot. When no value is specified, a suitable value close to LOD is automatically chosen. + +### Example +```{r, eval=FALSE} +# Consider data from a spiked-in contained in an example dataset +head(SpikeInDataNonLinear) + +nonlinear_quantlim_out <- nonlinear_quantlim(SpikeInDataNonLinear, alpha = 0.05) + +#Get values of LOB/LOD +nonlinear_quantlim_out$LOB[1] +nonlinear_quantlim_out$LOD[1] + +plot_quantlim(spikeindata = SpikeInDataLinear, quantlim_out = nonlinear_quantlim_out, +dir_output = getwd(), alpha = 0.05) +``` + +