5/10/2023
To install the private package:
# install.packages("devtools")
devtools::install_github("trichelab/bamSliceR",
ref = "main",
auth_token = "your github token: Settings ->
Developer settings ->
Personal access tokens")
Keep in mind that you also have to save your GDC token downloaded from GDC portal in a file in the user home directory, called .gdc_token.
library(bamSliceR)
availableProjectId()
## [1] "CGCI-HTMCP-CC" "TARGET-AML"
## [3] "GENIE-JHU" "GENIE-MSK"
## [5] "GENIE-VICC" "GENIE-MDA"
## [7] "TCGA-MESO" "TARGET-ALL-P3"
## [9] "TCGA-UVM" "TCGA-KICH"
## [11] "TARGET-WT" "TARGET-OS"
## [13] "TCGA-DLBC" "GENIE-UHN"
## [15] "APOLLO-LUAD" "CDDP_EAGLE-1"
## [17] "EXCEPTIONAL_RESPONDERS-ER" "MP2PRT-WT"
## [19] "CGCI-HTMCP-DLBCL" "CMI-MPC"
## [21] "WCDT-MCRPC" "TCGA-CHOL"
## [23] "TCGA-UCS" "TCGA-PCPG"
## [25] "CPTAC-2" "TCGA-CESC"
## [27] "TCGA-LIHC" "TCGA-ACC"
## [29] "CMI-MBC" "TCGA-BRCA"
## [31] "CPTAC-3" "TCGA-COAD"
## [33] "TCGA-GBM" "TCGA-TGCT"
## [35] "NCICCR-DLBCL" "TCGA-LGG"
## [37] "FM-AD" "GENIE-GRCC"
## [39] "CTSP-DLBCL1" "TARGET-CCSK"
## [41] "GENIE-NKI" "TARGET-ALL-P1"
## [43] "MATCH-N" "TRIO-CRU"
## [45] "CMI-ASC" "TARGET-RT"
## [47] "ORGANOID-PANCREATIC" "MATCH-Z1D"
## [49] "MATCH-B" "VAREPOP-APOLLO"
## [51] "MATCH-Q" "BEATAML1.0-CRENOLANIB"
## [53] "MATCH-Y" "OHSU-CNL"
## [55] "CGCI-HTMCP-LC" "TARGET-NBL"
## [57] "TCGA-SARC" "TCGA-PAAD"
## [59] "TCGA-LUAD" "TCGA-PRAD"
## [61] "MP2PRT-ALL" "TCGA-LUSC"
## [63] "TCGA-LAML" "TCGA-SKCM"
## [65] "HCMI-CMDC" "BEATAML1.0-COHORT"
## [67] "TCGA-BLCA" "TCGA-READ"
## [69] "TCGA-UCEC" "TCGA-THCA"
## [71] "TCGA-OV" "TCGA-KIRC"
## [73] "MMRF-COMMPASS" "GENIE-DFCI"
## [75] "TCGA-HNSC" "TCGA-ESCA"
## [77] "CGCI-BLGSP" "TARGET-ALL-P2"
## [79] "TCGA-STAD" "REBC-THYR"
## [81] "TCGA-KIRP" "TCGA-THYM"
availableExpStrategy("TARGET-AML")
## [1] "Genotyping Array" "Methylation Array" "RNA-Seq"
## [4] "WGS" "WXS" "miRNA-Seq"
availableWorkFlow(projectId = "TARGET-AML", es = "RNA-Seq")
## doc_count key
## 1 6450 Arriba
## 2 6450 STAR - Counts
## 3 6450 STAR-Fusion
## 4 3225 STAR 2-Pass Chimeric
## 5 3225 STAR 2-Pass Genome
## 6 3225 STAR 2-Pass Transcriptome
After knowing the keyword of project, experimental strategy and workflow, we can collect information of BAMs we are interested.
file_meta = getGDCBAMs(projectId = "TARGET-AML", es = "RNA-Seq", workflow = "STAR 2-Pass Genome")
head(file_meta)
## id sample
## 1 abc2b327-39c6-4d4f-9391-d07aa92d8833 TARGET-20-PAWBTJ-09A-01R
## 2 e895073b-e823-4152-971d-2689efd96be5 TARGET-20-PARXZP-09A-01R
## 3 a5a2bdf0-1979-491d-8535-d4e17fda66b9 TARGET-20-PANLXK-09A-03R
## 4 2fa5a766-307f-4bd7-9abb-3d6c0efe69c4 TARGET-20-PARUBT-40A-01R
## 5 0cf94c97-b3f2-4f91-836d-28ca46df31b5 TARGET-20-PAXKKC-10A-01R
## 6 4e608568-0588-44b5-aa85-5d55e4b30e0e TARGET-20-PAVCZF-04A-01R
## file_name
## 1 98ed96f9-88ef-43b7-a83d-6f66d34584ef.rna_seq.genomic.gdc_realn.bam
## 2 6203ff36-3658-44d1-a4f8-392ba7f1199f.rna_seq.genomic.gdc_realn.bam
## 3 186aef04-6517-42d4-b164-1189ac109593.rna_seq.genomic.gdc_realn.bam
## 4 5e7c4746-1ffc-43d6-85cc-eefc114aba99.rna_seq.genomic.gdc_realn.bam
## 5 d0e4b790-cba8-4da4-a770-b7d4995df8a2.rna_seq.genomic.gdc_realn.bam
## 6 adc88e24-63ac-4ab8-bae2-77c1f1bef50a.rna_seq.genomic.gdc_realn.bam
## case_id sample_type
## 1 TARGET-20-PAWBTJ Primary Blood Derived Cancer - Bone Marrow
## 2 TARGET-20-PARXZP Primary Blood Derived Cancer - Bone Marrow
## 3 TARGET-20-PANLXK Primary Blood Derived Cancer - Bone Marrow
## 4 TARGET-20-PARUBT Recurrent Blood Derived Cancer - Peripheral Blood
## 5 TARGET-20-PAXKKC Blood Derived Normal
## 6 TARGET-20-PAVCZF Recurrent Blood Derived Cancer - Bone Marrow
## experimental_strategy workflow
## 1 RNA-Seq STAR 2-Pass Genome
## 2 RNA-Seq STAR 2-Pass Genome
## 3 RNA-Seq STAR 2-Pass Genome
## 4 RNA-Seq STAR 2-Pass Genome
## 5 RNA-Seq STAR 2-Pass Genome
## 6 RNA-Seq STAR 2-Pass Genome
## downloaded_file_name
## 1 TARGET-20-PAWBTJ-09A-01R_TARGET-20-PAWBTJ_98ed96f9-88ef-43b7-a83d-6f66d34584ef.rna_seq.genomic.gdc_realn.bam
## 2 TARGET-20-PARXZP-09A-01R_TARGET-20-PARXZP_6203ff36-3658-44d1-a4f8-392ba7f1199f.rna_seq.genomic.gdc_realn.bam
## 3 TARGET-20-PANLXK-09A-03R_TARGET-20-PANLXK_186aef04-6517-42d4-b164-1189ac109593.rna_seq.genomic.gdc_realn.bam
## 4 TARGET-20-PARUBT-40A-01R_TARGET-20-PARUBT_5e7c4746-1ffc-43d6-85cc-eefc114aba99.rna_seq.genomic.gdc_realn.bam
## 5 TARGET-20-PAXKKC-10A-01R_TARGET-20-PAXKKC_d0e4b790-cba8-4da4-a770-b7d4995df8a2.rna_seq.genomic.gdc_realn.bam
## 6 TARGET-20-PAVCZF-04A-01R_TARGET-20-PAVCZF_adc88e24-63ac-4ab8-bae2-77c1f1bef50a.rna_seq.genomic.gdc_realn.bam
## UPC_ID
## 1 PAWBTJ
## 2 PARXZP
## 3 PANLXK
## 4 PARUBT
## 5 PAXKKC
## 6 PAVCZF
target_genes = readRDS("./data/target_genes.rds")
target_genes
## KMT2A GBA1 ETV6 IDH1 IDH2 TET1 TET2
## "KMT2A" "GBA1" "ETV6" "IDH1" "IDH2" "TET1" "TET2"
## ASXL1 ASXL2 DNMT3A RUNX1 CENPA H3-7 H3-3A
## "ASXL1" "ASXL2" "DNMT3A" "RUNX1" "CENPA" "H3-2" "H3F3A"
## H3-3B H3-4 H3-5 H3C1 H3C10 H3C11 H3C12
## "H3F3B" "H3-4" "H3F3C" "HIST1H3A" "HIST1H3H" "HIST1H3I" "HIST1H3J"
## H3C14 H3C14 H3C14 H3C2 H3C3 H3C4 H3C5P
## "H3C13" "H3C14" "H3C15" "HIST1H3B" "HIST1H3C" "HIST1H3D" "H3C5P"
## H3C6 H3C7 H3C8 H3C9P H3Y1 H3Y2
## "HIST1H3E" "HIST1H3F" "HIST1H3G" "H3C9P" "H3Y1" "H3Y2"
Get granges for exons of all genes
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(Homo.sapiens)
library(stringr)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
#all the GDC files are against hg38,
TxDb(Homo.sapiens) <- TxDb.Hsapiens.UCSC.hg38.knownGene
exs <- exonsBy(Homo.sapiens, "gene", columns="SYMBOL")
names(exs) <- mapIds(Homo.sapiens, names(exs), "SYMBOL", "GENEID")
exs <- exs[which(!is.na(names(exs)))]
Get the exons of target genes and make the format for bamSliceR input
target_genes_exs = target_genes[which(names(target_genes) %in% names(exs)) ]
target_genes_exons = exs[names(target_genes_exs) %>% unique()]
target_ranges = reduce(unlist(target_genes_exons) )
target_ranges = subset(target_ranges, seqnames %in% paste0 ("chr", c(1:22, "X", "Y") ))
target_ranges_chars = paste0(as.character(seqnames(target_ranges)), ":", start(ranges(target_ranges)), "-", end(ranges(target_ranges)) )
head(target_ranges_chars)
## [1] "chr1:226061851-226062094" "chr1:226062714-226062948"
## [3] "chr1:226063466-226063681" "chr1:226063977-226064824"
## [5] "chr1:226065656-226067269" "chr1:226071351-226072019"
#Download 3225 RNA-Seq sliced BAMs files with reads within regions defined by "target_ranges_chars" from TARGET-AML.
#downloadSlicedBAMs(file_df = file_meta, regions = target_ranges_chars, dir = "./inst/extdata/")
We first also need to specify the regions as a GRanges object.
head(target_ranges)
## GRanges object with 6 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 226061851-226062094 +
## [2] chr1 226062714-226062948 +
## [3] chr1 226063466-226063681 +
## [4] chr1 226063977-226064824 +
## [5] chr1 226065656-226067269 +
## [6] chr1 226071351-226072019 +
## -------
## seqinfo: 640 sequences (1 circular) from hg38 genome
Then we need make the character() vector including all the names of downloaded BAM files. I normally would do:
cd DIR_BAM_FILES
ls | grep bam$ > bamfiles
in the directory of the downloaded BAM files. And then scan the ‘bamfiles’ in R.
#bamfiles = scan("bamfiles", "character")
Last thing we need is to specify the directory of gmapGenome object created before.
#gmapGenome_dir = "/varidata/research/projects/triche/TARGET/GMKF/oncohistone/BAMs/hg38/"
We can then start to tally the reads of BAMs files:
#tallyReads(bamfiles = bamfiles, gmapGenome_dir = gmapGenome_dir, grs = target_ranges,
# BPPARAM = MulticoreParam(workers = 10 , stop.on.error = TRUE), parallelOnRanges = TRUE,
# parallelOnRangesBPPARAM = MulticoreParam(workers = 10) )
TallyVariants() originally from package ‘VariantTools’ allows parallelizing computing on both BAM files and GRanges using the same parameter, BPPARAM, for bplapply(). The setting is not efficient on certain circumstances and sometimes would unnecessarily eat up all the memory. For example, if we want to tally reads on thousands of BAMs but only few gene regions, ideally we want to put more workers on parallelizing computing on BAM files but less workers on granges regions. To overcome the issue, I modified TallyVariants() and wrapper up it to tallyReads(). We can now specify if we want to parallelize computing on granges regions using parameter “parallelOnRanges”, and provide the ‘parallelOnRangesBPPARAM’ specific for parallelize computing on granges regions.
Example1: tally reads on “chr17:7665307:7704652” of 679 BAMs files If we set 10 workers to parallelize compute on both BAM files and granges regions, it would take ~12 mins to complete.
library(bamSliceR)
library(VariantAnnotation)
library(BiocParallel)
library(gmapR)
library(VariantTools)
setwd("/varidata/research/projects/triche/Peter/bamSliceR/TP53/RNA_ALL_BAMs")
bamfiles = scan("bamfiles", "character")
gmapGenome_dir = "/varidata/research/projects/triche/TARGET/GMKF/oncohistone/BAMs/hg38/"
GRanges( seqnames = Rle (c("chr17")) , IRanges(start=7665307, end=7704652), strand = Rle(strand(c("*")) ) ) -> TP53_gr
tallyReads(bamfiles = bamfiles, gmapGenome_dir = gmapGenome_dir, grs = TP53_gr,
BPPARAM = MulticoreParam(workers = 10 , stop.on.error = TRUE), parallelOnRanges = TRUE,
parallelOnRangesBPPARAM = MulticoreParam(workers = 10) ) -> TARGET_ALL_RNA_TP53
saveRDS(TARGET_ALL_RNA_TP53, "../TARGET_ALL_RNA_TP53.rds")
If we set 80 workers to parallelize compute on BAM files and set ‘parallelOnRanges = FALSE’, it would take ~2 mins to complete.
library(bamSliceR)
library(VariantAnnotation)
library(BiocParallel)
library(gmapR)
library(VariantTools)
setwd("/varidata/research/projects/triche/Peter/bamSliceR/TP53/RNA_ALL_BAMs")
bamfiles = scan("bamfiles", "character")
gmapGenome_dir = "/varidata/research/projects/triche/TARGET/GMKF/oncohistone/BAMs/hg38/"
GRanges( seqnames = Rle (c("chr17")) , IRanges(start=7665307, end=7704652), strand = Rle(strand(c("*")) ) ) -> TP53_gr
tallyReads(bamfiles = bamfiles, gmapGenome_dir = gmapGenome_dir, grs = TP53_gr,
BPPARAM = MulticoreParam(workers = 80 , stop.on.error = TRUE), parallelOnRanges = FALSE,
parallelOnRangesBPPARAM = MulticoreParam(workers = 10) ) -> TARGET_ALL_RNA_TP53
saveRDS(TARGET_ALL_RNA_TP53, "../TARGET_ALL_RNA_TP53_2.rds")
Example2: 100 BAM files (sliced on 500+ genes) in Lauren’s folder “BAM_slices_T20_09” and 50 gene regions:
library(bamSliceR)
library(VariantAnnotation)
library(BiocParallel)
library(gmapR)
library(VariantTools)
setwd("/varidata/research/projects/triche/TARGET/GMKF/germline_validations/BAM_slices_T20_09")
bamfiles = scan("/varidata/research/projects/triche/Peter/bamSliceR/TARGET_AML_germline/BAM_slices_T20_09/bamfiles", "character")
gmapGenome_dir = "/varidata/research/projects/triche/TARGET/GMKF/oncohistone/BAMs/hg38/"
target_ranges_gr = readRDS("/varidata/research/projects/triche/Peter/bamSliceR/TARGET_AML_germline/target_ranges_gr.rds")
seqlevelsStyle(target_ranges_gr) <- "UCSC"
keepSeqlevels(target_ranges_gr, paste0("chr", c(1:22,"X") ) ) -> target_ranges_gr
c( "TARGET-20-PASBHI-09A-01R.RNA.GRCh38.sliced.bam",
"TARGET-20-PASVYA-09A-01R.RNA.GRCh38.sliced.bam",
"TARGET-20-PASVYL-09A-01R.RNA.GRCh38.sliced.bam",
"TARGET-20-PASWLN-09A-01R.RNA.GRCh38.sliced.bam") -> noIndexBamfiles
bamfiles[ -which(bamfiles %in% noIndexBamfiles)] -> indexed_bamfiles
Sys.time() -> t1
tallyReads(bamfiles = indexed_bamfiles[1:100] , gmapGenome_dir = gmapGenome_dir, grs = target_ranges_gr[1:50],
BPPARAM = MulticoreParam(workers = 10 , stop.on.error = TRUE), parallelOnRanges = TRUE,
parallelOnRangesBPPARAM = MulticoreParam(workers = 10 ) ) -> TARGET_AML_RNA_10
Sys.time() -> t2
t2 - t1
#Time difference of 52.10761 mins
We can use sbatch to submit bamSliceR job in Rcode to new HPC. The Rcode for downloading BAMs would be like this:
#bamSliceR_Download.r
library(bamSliceR)
library(GenomicDataCommons)
library(httr)
BAMs_FOLDER_DIR = "/varidata/research/projects/triche/Peter/bamSliceR/WT1/TARGET_AML_RNA"
WT1 = c("chr11:32379149-32468665")
TARGET_AML_RNA_BAMs = getGDCBAMs("TARGET-AML", "RNA-Seq", "STAR 2-Pass Genome" )
downloadSlicedBAMs(file_df = TARGET_AML_RNA_BAMs, regions = WT1, dir = BAMs_FOLDER_DIR)
The Rcode for tally BAMs would be like this:
#bamSliceR_TallyReads.r
library(bamSliceR)
library(GenomicDataCommons)
library(VariantAnnotation)
library(BiocParallel)
BAMs_FOLDER_DIR = "/varidata/research/projects/triche/Peter/bamSliceR/WT1/TARGET_AML_RNA"
gmapGenome_dir = "/varidata/research/projects/triche/TARGET/GMKF/oncohistone/BAMs/hg38/"
setwd(BAMs_FOLDER_DIR)
WT1_gr = GRanges( seqnames = Rle (c("chr11")) , IRanges(start=32379149, end=32468665), strand = Rle(strand(c("*")) ) )
TARGET_AML_RNA_BAMs = getGDCBAMs("TARGET-AML", "RNA-Seq", "STAR 2-Pass Genome" )
badbamfiles = scan("bad_bams.fofn", "character")
bamfiles = scan("bamfiles", "character")
if (length(badbamfiles) == 0)
{
valid_bamfiles = bamfiles
} else
{
valid_bamfiles = bamfiles[ -which( bamfiles %in% badbamfiles)]
}
tallyReads(bamfiles = valid_bamfiles, gmapGenome_dir = gmapGenome_dir, grs = WT1_gr,
BPPARAM = MulticoreParam(workers = 80 , stop.on.error = TRUE), parallelOnRanges = FALSE,
parallelOnRangesBPPARAM = MulticoreParam(workers = 10) ) -> TARGET_AML_RNA_WT1
saveRDS(TARGET_AML_RNA_WT1, "TARGET_AML_RNA_WT1.rds")
TARGET_AML_RNA_WT1_combined = stackSamples(VRangesList(TARGET_AML_RNA_WT1))
TARGET_AML_RNA_WT1_combined = annotateWithBAMinfo(tallied_reads = TARGET_AML_RNA_WT1_combined, file_meta = TARGET_AML_RNA_BAMs)
saveRDS(TARGET_AML_RNA_WT1_combined, "TARGET_AML_RNA_WT1_combined.rds")
The sbatch file would be like this (bamSliceR.sh):
#!/bin/bash
#SBATCH --export=NONE
#SBATCH -J TallyReads2
#SBATCH -o TallyReads2.o
#SBATCH -e TallyReads2.e
#SBATCH --ntasks 1
#SBATCH --time 3:00:00
#SBATCH --mem=800G
BAMs_FOLDER_DIR="/varidata/research/projects/triche/Peter/bamSliceR/WT1/TARGET_AML_RNA/"
start_time=$(date +"%T")
R CMD BATCH $BAMs_FOLDER_DIR/bamSliceR_Download.r
end_time=$(date +"%T")
echo "Downloading BAMs:"
echo "Start time: $start_time"
echo "End time: $end_time"
start_time=$(date +"%T")
cd $BAMs_FOLDER_DIR
samtools quickcheck -v *.bam > bad_bams.fofn && echo 'all ok' || echo 'some files failed check, see bad_bams.fofn'
for i in $(ls | grep bam$); do samtools index $i; done
ls | grep bam$ > bamfiles
end_time=$(date +"%T")
echo "Index BAMs:"
echo "Start time: $start_time"
echo "End time: $end_time"
start_time=$(date +"%T")
R CMD BATCH $BAMs_FOLDER_DIR/bamSliceR_TallyReads.r
end_time=$(date +"%T")
echo "Tally BAMs:"
echo "Start time: $start_time"
echo "End time: $end_time"
Submit the jobs to hpc:
sbatch -p bigmem -n 128 bamSliceR.sh