Skip to content

Commit

Permalink
Version ready to be tested:
Browse files Browse the repository at this point in the history
-new design of TCGAanalyze_DEA
-TCGAbatch_correction: voom transformation and exploratory analysis using ComBat
-TCGAtumor_purity to filter TCGA samples according to tumor purity estimates
-TCGAPam50: returns dataframe with subtype for each TCGA barcodes
-UseRaw_afterFilter: recovers raw counts after gene filtering and normalization steps
-TCGAquery_recount2: Querying data from recount2 project
  • Loading branch information
simomounir committed Sep 5, 2017
1 parent 75d4904 commit cd97f93
Show file tree
Hide file tree
Showing 16 changed files with 1,190 additions and 51 deletions.
12 changes: 12 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export(GDCprepare_clinic)
export(GDCquery)
export(GDCquery_Maf)
export(GDCquery_clinic)
export(TCGAPam50)
export(TCGAVisualize_volcano)
export(TCGAanalyze_Clustering)
export(TCGAanalyze_DEA)
Expand All @@ -21,11 +22,14 @@ export(TCGAanalyze_SurvivalKM)
export(TCGAanalyze_analyseGRN)
export(TCGAanalyze_networkInference)
export(TCGAanalyze_survival)
export(TCGAbatch_Correction)
export(TCGAprepare_Affy)
export(TCGAprepare_elmer)
export(TCGAquery_MatchedCoupledSampleTypes)
export(TCGAquery_SampleTypes)
export(TCGAquery_recount2)
export(TCGAquery_subtype)
export(TCGAtumor_purity)
export(TCGAvisualize_BarPlot)
export(TCGAvisualize_EAbarplot)
export(TCGAvisualize_Heatmap)
Expand All @@ -34,6 +38,7 @@ export(TCGAvisualize_SurvivalCoxNET)
export(TCGAvisualize_meanMethylation)
export(TCGAvisualize_oncoprint)
export(TCGAvisualize_starburst)
export(UseRaw_afterFilter)
export(gaiaCNVplot)
export(getAdjacencyBiogrid)
export(getDataCategorySummary)
Expand Down Expand Up @@ -135,6 +140,12 @@ importFrom(gridExtra,rbind.gtable)
importFrom(httr,timeout)
importFrom(jsonlite,fromJSON)
importFrom(knitr,kable)
importFrom(limma,contrasts.fit)
importFrom(limma,eBayes)
importFrom(limma,lmFit)
importFrom(limma,makeContrasts)
importFrom(limma,toptable)
importFrom(limma,voom)
importFrom(matlab,jet.colors)
importFrom(methods,as)
importFrom(methods,is)
Expand All @@ -156,6 +167,7 @@ importFrom(survival,coxph)
importFrom(survival,survdiff)
importFrom(survival,survfit)
importFrom(survminer,ggsurvplot)
importFrom(sva,ComBat)
importFrom(tibble,as_data_frame)
importFrom(tools,file_ext)
importFrom(tools,md5sum)
Expand Down
490 changes: 446 additions & 44 deletions R/analyze.R

Large diffs are not rendered by default.

99 changes: 99 additions & 0 deletions R/clinical.R
Original file line number Diff line number Diff line change
Expand Up @@ -486,3 +486,102 @@ TCGAquery_subtype <- function(tumor){
stop("For the moment we have only subtype for: acc, brca, coad, gbm, hnsc, kich, kirp, kirc, lgg, luad, lusc, prad, pancan, read, skcm, stad, thca and ucec")
}
}



#' @title Retrieve PAM50 molecular subtypes for given TCGA barcodes
#' @description
#' TCGAPam50 Retrieve PAM50 molecular subtypes from TCGA consortium for a given set of barcodes
#' @param barcodes is a vector of TCGA barcodes
#' @export
#' @examples
#' Purity.BRCA<-TCGAtumor_purity(barcodes, 0, 0, 0, 0, 0.7)
#' @return List with $subtypes attribute as a dataframe with barcodes, samples, subtypes, and colors. The $filtered attribute is returned as filtered samples with no subtype info
TCGAPam50<-function(barcodes){

Pam50<-TabSubtypesCol_merged
Pam50$samples<-as.character(Pam50$samples)
barcodes<-as.character(barcodes)

patients<-sapply(barcodes, function(x) paste(unlist(stringr::str_split(x, "-"))[1:3], collapse = "-"))
filt.p<-c()
df.barcodes_patID<- data.frame(barcodes=barcodes, patID=patients, row.names = 1:length(barcodes))
#print(df.barcodes_patID)
for(p in patients){
if(p%in%Pam50$samples==FALSE)
filt.p<-c(filt.p, p)
}

if(length(filt.p)>0){
message("the following TCGA barcodes/patients with no subtypes were filtered:")
filt<-as.character(df.barcodes_patID[which(df.barcodes_patID$patID%in%filt.p),]$barcodes)
print(filt)
}
else filt<-c()


patients.filtered<-unlist(patients[patients%in%filt.p==FALSE])
idx.patient<-which(Pam50$samples%in%patients.filtered)

Subtypes<-Pam50[idx.patient,]
Subtypes<-Subtypes[match(patients.filtered, Subtypes$samples),]
Subtypes$barcodes<-as.character(df.barcodes_patID[which(df.barcodes_patID$patID%in%Subtypes$samples),]$barcodes)

return(list(subtypes=Subtypes, filtered=filt))
}

#' @title Filters TCGA barcodes according to purity parameters
#' @description
#' TCGAtumor_purity Filters TCGA samples using 5 estimates from 5 methods as thresholds.
#' @param barcodes is a vector of TCGA barcodes
#' @param estimate uses gene expression profiles of 141 immune genes and 141 stromal genes
#' @param absolute which uses somatic copy-number data (estimations were available for only 11 cancer types)
#' @param lump (leukocytes unmethylation for purity), which averages 44 non-methylated immune-specific CpG sites
#' @param ihc as estimated by image analysis of haematoxylin and eosin stain slides produced by the Nationwide Childrens Hospital Biospecimen Core Resource
#' @param cpe CPE is a derived consensus measurement as the median purity level after normalizing levels from all methods to give them equal means and s.ds
#' @export
#' @examples
#' @return List with $pure_barcodes attribute as a vector of pure samples and $filtered attribute as filtered samples with no purity info
TCGAtumor_purity<-function(barcodes, estimate, absolute, lump, ihc, cpe){
Tumor.purity<-Tumor.purity
barcodes<=as.character(barcodes)
Tumor.purity$Sample.ID<-as.character(Tumor.purity$Sample.ID)
Tumor.purity$ESTIMATE<-as.numeric(gsub(",", ".", Tumor.purity$ESTIMATE))
Tumor.purity$ABSOLUTE<-as.numeric(gsub(",", ".", Tumor.purity$ABSOLUTE))
Tumor.purity$LUMP<-as.numeric(gsub(",", ".", Tumor.purity$LUMP))
Tumor.purity$IHC<-as.numeric(gsub(",", ".", Tumor.purity$IHC))
Tumor.purity$CPE<-as.numeric(gsub(",", ".", Tumor.purity$CPE))



samples.id<-sapply(barcodes, function(x) paste(unlist(stringr::str_split(x, "-"))[1:4], collapse = "-"))


df.barcodes_sampID<- data.frame(barcodes=barcodes, sampID=samples.id, row.names = 1:length(barcodes))
print(df.barcodes_sampID)
filt.s<-c()

for(s in samples.id){
if(s%in%Tumor.purity$Sample.ID==FALSE)
filt.s<-c(filt.s, s)
}

if(length(filt.s)>0){
message("the following TCGA barcodes do not have info on tumor purity:")
filt<-as.character(df.barcodes_sampID[which(df.barcodes_sampID$sampID%in%filt.s),]$barcodes)
print(filt)
}
else filt<-c()

samples.filtered<-unlist(samples.id[samples.id%in%filt.s==FALSE])

idx.samples<-which(Tumor.purity$Sample.ID%in%samples.filtered & Tumor.purity$ESTIMATE>=estimate & Tumor.purity$ABSOLUTE>=absolute & Tumor.purity$LUMP>=lump & Tumor.purity$CPE>=cpe)
df.purity<-Tumor.purity[idx.samples,]
#print(df.purity)
idx<-which(df.barcodes_sampID$sampID%in%df.purity$Sample.ID)

filtered.barcodes<-as.character(df.barcodes_sampID[idx,]$barcodes)

return(list(pure_barcodes=filtered.barcodes, filtered=filt))
}

22 changes: 22 additions & 0 deletions R/prepare.R
Original file line number Diff line number Diff line change
Expand Up @@ -1069,3 +1069,25 @@ TCGAprepare_Affy <- function(ClinData, PathFolder, TabCel){
return(mat)

}


##Function to take raw counts by removing rows filtered after norm and filter process###

#' @title Use raw count from the DataPrep object which genes are removed by normalization and filtering steps.
#' @description function to keep raw counts after filtering and/or normalizing.
#' @param DataPrep DataPrep object returned by TCGAanalyze_Preprocessing()
#' @param Filtered data frame containing samples in columns and genes in rows after normalization and/or filtering steps
#' @examples
#' \dontrun{
#' dataPrep_raw<-UseRaw_afterFilter(dataPrep, dataFilt)
#' }
#' @export
#' @return Filtered return object similar to DataPrep with genes removed after normalization and filtering process.
UseRaw_afterFilter<-function(DataPrep, DataFilt){
rownames(DataPrep)<-lapply(rownames(DataPrep), function(x) gsub("[[:punct:]]\\d*", "", x ))
filtered.list <- setdiff(rownames(DataPrep), rownames(DataFilt))
Res <- DataPrep[!rownames(DataPrep) %in% filtered.list, ]
return(Res)
}


57 changes: 57 additions & 0 deletions R/query.R
Original file line number Diff line number Diff line change
Expand Up @@ -569,3 +569,60 @@ GDCquery_Maf <- function(tumor, save.csv= FALSE, directory = "GDCdata", pipeline
return(maf)
}

#' @title Query gene counts of TCGA and GTEx data from the Recount2 project
#' @description
#' TCGAquery_recount2 queries and downloads data produced by the Recount2 project. User can specify which project and which tissue to query
#' @param project is a string denoting which project the user wants. Options are "tcga" and "gtex"
#' @param tissue a vector of tissue(s) to download. Options are "adipose tissue", "adrenal", "gland", "bladder","blood", "blood vessel", "bone marrow", "brain", "breast","cervix uteri", "colon", "esophagus", "fallopian tube","heart", "kidney", "liver", "lung", "muscle", "nerve", "ovary","pancreas", "pituitary", "prostate", "salivary", "gland", "skin", "small intestine", "spleen", "stomach", "testis", "thyroid", "uterus", "vagina"
#' @export
#' @examples
#' brain.rec<-TCGArecount2_query(project = "gtex", tissue = "brain")
#' @return List with $subtypes attribute as a dataframe with barcodes, samples, subtypes, and colors. The $filtered attribute is returned as filtered samples with no subtype info
TCGAquery_recount2<-function(project, tissue=c()){
tissues<-c("adipose tissue", "adrenal", "gland", "bladder",
"blood", "blood vessel", "bone marrow", "brain", "breast",
"cervix uteri", "colon", "esophagus", "fallopian tube",
"heart", "kidney", "liver", "lung", "muscle", "nerve", "ovary",
"pancreas", "pituitary", "prostate", "salivary", "gland", "skin",
"small intestine", "spleen", "stomach", "testis", "thyroid",
"uterus", "vagina")
tissue<-paste(unlist(strsplit(tissue, " ")), collapse="_")
Res<-list()

if(tolower(project)=="gtex"){
for(t_i in tissue){
if(tissue%in%tissues){
con<-"http://duffel.rail.bio/recount/SRP012682/rse_gene_"
con<-paste0(con,tissue,".Rdata")
message(paste0("downloading Range Summarized Experiment for: ", tissue))
load(url(con))
Res[[paste0(project,"_", t_i)]]<-rse_gene
###Convert to matrix[genes, samples]#####
#ES<- exprs(as(rse_gene, "ExpressionSet"))

###Remove version from ENSG ids (what's after the ".")####
#rownames(ES)<-gsub("\\..*","",rownames(ES))
}
else stop(paste0(tissue, " is not an available tissue on Recount2"))
}
return(Res)
}
else if(tolower(project)=="tcga"){
for(t_i in tissue){
if(tissue%in%tissues){
con<-"http://duffel.rail.bio/recount/TCGA/rse_gene_"
con<-paste0(con,tissue,".Rdata")
message(paste0("downloading Range Summarized Experiment for: ", tissue))
load(con)
Res[[paste0(project,"_", t_i)]]<-rse_gene

}
else stop(paste0(tissue, " is not an available tissue on Recount2"))
}
return(Res)
}
else stop(paste0(project, " is not a valid project"))

}


Binary file modified R/sysdata.rda
Binary file not shown.
20 changes: 20 additions & 0 deletions man/TCGAPam50.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

41 changes: 35 additions & 6 deletions man/TCGAanalyze_DEA.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

33 changes: 33 additions & 0 deletions man/TCGAbatch_Correction.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 22 additions & 0 deletions man/TCGAquery_recount2.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit cd97f93

Please sign in to comment.