Version ready to be tested:

-new design of TCGAanalyze_DEA -TCGAbatch_correction: voom transformation and exploratory analysis using ComBat -TCGAtumor_purity to filter TCGA samples according to tumor purity estimates -TCGAPam50: returns dataframe with subtype for each TCGA barcodes -UseRaw_afterFilter: recovers raw counts after gene filtering and normalization steps -TCGAquery_recount2: Querying data from recount2 project
QiStark · Sep 5, 2017 · cd97f93 · cd97f93
1 parent 75d4904
commit cd97f93
Show file tree

Hide file tree

Showing 16 changed files with 1,190 additions and 51 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -6,6 +6,7 @@ export(GDCprepare_clinic)
 export(GDCquery)
 export(GDCquery_Maf)
 export(GDCquery_clinic)
+export(TCGAPam50)
 export(TCGAVisualize_volcano)
 export(TCGAanalyze_Clustering)
 export(TCGAanalyze_DEA)
@@ -21,11 +22,14 @@ export(TCGAanalyze_SurvivalKM)
 export(TCGAanalyze_analyseGRN)
 export(TCGAanalyze_networkInference)
 export(TCGAanalyze_survival)
+export(TCGAbatch_Correction)
 export(TCGAprepare_Affy)
 export(TCGAprepare_elmer)
 export(TCGAquery_MatchedCoupledSampleTypes)
 export(TCGAquery_SampleTypes)
+export(TCGAquery_recount2)
 export(TCGAquery_subtype)
+export(TCGAtumor_purity)
 export(TCGAvisualize_BarPlot)
 export(TCGAvisualize_EAbarplot)
 export(TCGAvisualize_Heatmap)
@@ -34,6 +38,7 @@ export(TCGAvisualize_SurvivalCoxNET)
 export(TCGAvisualize_meanMethylation)
 export(TCGAvisualize_oncoprint)
 export(TCGAvisualize_starburst)
+export(UseRaw_afterFilter)
 export(gaiaCNVplot)
 export(getAdjacencyBiogrid)
 export(getDataCategorySummary)
@@ -135,6 +140,12 @@ importFrom(gridExtra,rbind.gtable)
 importFrom(httr,timeout)
 importFrom(jsonlite,fromJSON)
 importFrom(knitr,kable)
+importFrom(limma,contrasts.fit)
+importFrom(limma,eBayes)
+importFrom(limma,lmFit)
+importFrom(limma,makeContrasts)
+importFrom(limma,toptable)
+importFrom(limma,voom)
 importFrom(matlab,jet.colors)
 importFrom(methods,as)
 importFrom(methods,is)
@@ -156,6 +167,7 @@ importFrom(survival,coxph)
 importFrom(survival,survdiff)
 importFrom(survival,survfit)
 importFrom(survminer,ggsurvplot)
+importFrom(sva,ComBat)
 importFrom(tibble,as_data_frame)
 importFrom(tools,file_ext)
 importFrom(tools,md5sum)

diff --git a/R/analyze.R b/R/analyze.R
diff --git a/R/clinical.R b/R/clinical.R
@@ -486,3 +486,102 @@ TCGAquery_subtype <- function(tumor){
         stop("For the moment we have only subtype for: acc, brca, coad, gbm, hnsc, kich, kirp, kirc, lgg, luad, lusc, prad, pancan, read, skcm, stad, thca and ucec")
     }
 }
+
+
+
+#' @title Retrieve PAM50 molecular subtypes for given TCGA barcodes
+#' @description
+#'   TCGAPam50 Retrieve PAM50 molecular subtypes from TCGA consortium for a given set of barcodes
+#' @param barcodes is a vector of TCGA barcodes
+#' @export
+#' @examples
+#' Purity.BRCA<-TCGAtumor_purity(barcodes, 0, 0, 0, 0, 0.7)
+#' @return List with $subtypes attribute as a dataframe with barcodes, samples, subtypes, and colors. The $filtered attribute is returned as filtered samples with no subtype info
+TCGAPam50<-function(barcodes){
+
+  Pam50<-TabSubtypesCol_merged
+  Pam50$samples<-as.character(Pam50$samples)
+  barcodes<-as.character(barcodes)
+
+  patients<-sapply(barcodes, function(x) paste(unlist(stringr::str_split(x, "-"))[1:3], collapse = "-"))
+  filt.p<-c()
+  df.barcodes_patID<- data.frame(barcodes=barcodes, patID=patients, row.names = 1:length(barcodes))
+  #print(df.barcodes_patID)
+  for(p in patients){
+    if(p%in%Pam50$samples==FALSE)
+      filt.p<-c(filt.p, p)
+  }
+
+  if(length(filt.p)>0){
+    message("the following TCGA barcodes/patients with no subtypes were filtered:")
+    filt<-as.character(df.barcodes_patID[which(df.barcodes_patID$patID%in%filt.p),]$barcodes)
+    print(filt)
+  }
+  else filt<-c()
+
+
+  patients.filtered<-unlist(patients[patients%in%filt.p==FALSE])
+  idx.patient<-which(Pam50$samples%in%patients.filtered)
+
+  Subtypes<-Pam50[idx.patient,]
+  Subtypes<-Subtypes[match(patients.filtered, Subtypes$samples),]
+  Subtypes$barcodes<-as.character(df.barcodes_patID[which(df.barcodes_patID$patID%in%Subtypes$samples),]$barcodes)
+
+  return(list(subtypes=Subtypes, filtered=filt))
+}
+
+#' @title Filters TCGA barcodes according to purity parameters
+#' @description
+#'   TCGAtumor_purity Filters TCGA samples using 5 estimates from 5 methods as thresholds.
+#' @param barcodes is a vector of TCGA barcodes
+#' @param estimate uses gene expression profiles of 141 immune genes and 141 stromal genes
+#' @param absolute which uses somatic copy-number data (estimations were available for only 11 cancer types)
+#' @param lump (leukocytes unmethylation for purity), which averages 44 non-methylated immune-specific CpG sites
+#' @param ihc as estimated by image analysis of haematoxylin and eosin stain slides produced by the Nationwide Childrens Hospital Biospecimen Core Resource
+#' @param cpe CPE is a derived consensus measurement as the median purity level after normalizing levels from all methods to give them equal means and s.ds
+#' @export
+#' @examples
+#' @return List with $pure_barcodes attribute as a vector of pure samples and $filtered attribute as filtered samples with no purity info
+TCGAtumor_purity<-function(barcodes, estimate, absolute, lump, ihc, cpe){
+  Tumor.purity<-Tumor.purity
+  barcodes<=as.character(barcodes)
+  Tumor.purity$Sample.ID<-as.character(Tumor.purity$Sample.ID)
+  Tumor.purity$ESTIMATE<-as.numeric(gsub(",", ".", Tumor.purity$ESTIMATE))
+  Tumor.purity$ABSOLUTE<-as.numeric(gsub(",", ".", Tumor.purity$ABSOLUTE))
+  Tumor.purity$LUMP<-as.numeric(gsub(",", ".", Tumor.purity$LUMP))
+  Tumor.purity$IHC<-as.numeric(gsub(",", ".", Tumor.purity$IHC))
+  Tumor.purity$CPE<-as.numeric(gsub(",", ".", Tumor.purity$CPE))
+
+
+
+  samples.id<-sapply(barcodes, function(x) paste(unlist(stringr::str_split(x, "-"))[1:4], collapse = "-"))
+
+
+  df.barcodes_sampID<- data.frame(barcodes=barcodes, sampID=samples.id, row.names = 1:length(barcodes))
+  print(df.barcodes_sampID)
+  filt.s<-c()
+
+  for(s in samples.id){
+    if(s%in%Tumor.purity$Sample.ID==FALSE)
+      filt.s<-c(filt.s, s)
+  }
+
+  if(length(filt.s)>0){
+    message("the following TCGA barcodes do not have info on tumor purity:")
+    filt<-as.character(df.barcodes_sampID[which(df.barcodes_sampID$sampID%in%filt.s),]$barcodes)
+    print(filt)
+  }
+  else filt<-c()
+
+  samples.filtered<-unlist(samples.id[samples.id%in%filt.s==FALSE])
+
+  idx.samples<-which(Tumor.purity$Sample.ID%in%samples.filtered & Tumor.purity$ESTIMATE>=estimate & Tumor.purity$ABSOLUTE>=absolute & Tumor.purity$LUMP>=lump & Tumor.purity$CPE>=cpe)
+  df.purity<-Tumor.purity[idx.samples,]
+  #print(df.purity)
+  idx<-which(df.barcodes_sampID$sampID%in%df.purity$Sample.ID)
+
+  filtered.barcodes<-as.character(df.barcodes_sampID[idx,]$barcodes)
+
+  return(list(pure_barcodes=filtered.barcodes, filtered=filt))
+}
+
diff --git a/R/prepare.R b/R/prepare.R
@@ -1069,3 +1069,25 @@ TCGAprepare_Affy <- function(ClinData, PathFolder, TabCel){
     return(mat)
 
 }
+
+
+##Function to take raw counts by removing rows filtered after norm and filter process###
+
+#' @title Use raw count from the DataPrep object which genes are removed by normalization and filtering steps.
+#' @description function to keep raw counts after filtering and/or normalizing.
+#' @param DataPrep DataPrep object returned by TCGAanalyze_Preprocessing()
+#' @param Filtered data frame containing samples in columns and genes in rows after normalization and/or filtering steps
+#' @examples
+#' \dontrun{
+#' dataPrep_raw<-UseRaw_afterFilter(dataPrep, dataFilt)
+#' }
+#' @export
+#' @return Filtered return object similar to DataPrep with genes removed after normalization and filtering process.
+UseRaw_afterFilter<-function(DataPrep, DataFilt){
+  rownames(DataPrep)<-lapply(rownames(DataPrep), function(x) gsub("[[:punct:]]\\d*", "", x ))
+  filtered.list <- setdiff(rownames(DataPrep), rownames(DataFilt))
+  Res <- DataPrep[!rownames(DataPrep) %in% filtered.list, ]
+  return(Res)
+}
+
+
diff --git a/R/query.R b/R/query.R
@@ -569,3 +569,60 @@ GDCquery_Maf <- function(tumor, save.csv= FALSE, directory = "GDCdata", pipeline
     return(maf)
 }
 
+#' @title Query gene counts of TCGA and GTEx data from the Recount2 project
+#' @description
+#'   TCGAquery_recount2 queries and downloads data produced by the Recount2 project. User can specify which project and which tissue to query
+#' @param project is a string denoting which project the user wants. Options are "tcga" and "gtex"
+#' @param tissue a vector of tissue(s) to download. Options are "adipose tissue", "adrenal", "gland", "bladder","blood", "blood vessel", "bone marrow", "brain", "breast","cervix uteri", "colon", "esophagus", "fallopian tube","heart", "kidney", "liver", "lung", "muscle", "nerve", "ovary","pancreas", "pituitary", "prostate", "salivary", "gland", "skin", "small intestine", "spleen", "stomach", "testis", "thyroid", "uterus", "vagina"
+#' @export
+#' @examples
+#' brain.rec<-TCGArecount2_query(project = "gtex", tissue = "brain")
+#' @return List with $subtypes attribute as a dataframe with barcodes, samples, subtypes, and colors. The $filtered attribute is returned as filtered samples with no subtype info
+TCGAquery_recount2<-function(project, tissue=c()){
+  tissues<-c("adipose tissue", "adrenal", "gland", "bladder",
+             "blood", "blood vessel", "bone marrow", "brain", "breast",
+             "cervix uteri", "colon", "esophagus", "fallopian tube",
+             "heart", "kidney", "liver", "lung", "muscle", "nerve", "ovary",
+             "pancreas", "pituitary", "prostate", "salivary", "gland", "skin",
+             "small intestine", "spleen", "stomach", "testis", "thyroid",
+             "uterus", "vagina")
+  tissue<-paste(unlist(strsplit(tissue, " ")), collapse="_")
+  Res<-list()
+
+  if(tolower(project)=="gtex"){
+    for(t_i in tissue){
+      if(tissue%in%tissues){
+        con<-"http://duffel.rail.bio/recount/SRP012682/rse_gene_"
+        con<-paste0(con,tissue,".Rdata")
+        message(paste0("downloading Range Summarized Experiment for: ", tissue))
+        load(url(con))
+        Res[[paste0(project,"_", t_i)]]<-rse_gene
+        ###Convert to matrix[genes, samples]#####
+        #ES<- exprs(as(rse_gene, "ExpressionSet"))
+
+        ###Remove version from ENSG ids (what's after the ".")####
+        #rownames(ES)<-gsub("\\..*","",rownames(ES))
+      }
+      else stop(paste0(tissue, " is not an available tissue on Recount2")) 
+    }
+    return(Res)
+  }
+  else if(tolower(project)=="tcga"){
+    for(t_i in tissue){
+      if(tissue%in%tissues){
+        con<-"http://duffel.rail.bio/recount/TCGA/rse_gene_"
+        con<-paste0(con,tissue,".Rdata")
+        message(paste0("downloading Range Summarized Experiment for: ", tissue))
+        load(con)
+        Res[[paste0(project,"_", t_i)]]<-rse_gene
+
+      }
+      else stop(paste0(tissue, " is not an available tissue on Recount2")) 
+    }
+    return(Res)
+  }
+  else stop(paste0(project, " is not a valid project"))
+
+}
+
+
diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/man/TCGAPam50.Rd b/man/TCGAPam50.Rd
diff --git a/man/TCGAanalyze_DEA.Rd b/man/TCGAanalyze_DEA.Rd
diff --git a/man/TCGAbatch_Correction.Rd b/man/TCGAbatch_Correction.Rd
diff --git a/man/TCGAquery_recount2.Rd b/man/TCGAquery_recount2.Rd