Enable tabix-indexing for tabular output
bschilder committed Oct 3, 2021
1 parent 2de2d68 commit 8ab84dc
Showing 13 changed files with 280 additions and 14 deletions.
4 changes: 3 additions & 1 deletion DESCRIPTION
@@ -66,6 +66,8 @@ Suggests:
testthat (>= 3.0.0),
UpSetR,
BiocStyle,
covr
covr,
seqminer,
Rsamtools
Config/testthat/edition: 3
VignetteBuilder: knitr
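
Note: `seqminer` and `Rsamtools` are added under Suggests rather than Imports, so users who want tabix-indexed tabular output need to install them themselves. A minimal installation sketch (not part of this commit; assumes BiocManager is the preferred installer):

    if (!requireNamespace("BiocManager", quietly = TRUE)) {
        install.packages("BiocManager")
    }
    # seqminer (CRAN) builds the tabix index; Rsamtools (Bioconductor) handles bgzip compression
    BiocManager::install(c("seqminer", "Rsamtools"))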
4 changes: 4 additions & 0 deletions NAMESPACE
@@ -5,6 +5,7 @@ export(find_sumstats)
export(format_sumstats)
export(get_genome_builds)
export(import_sumstats)
export(index_tabular)
export(load_snp_loc_data)
export(read_sumstats)
export(write_sumstats)
@@ -16,6 +17,7 @@ importFrom(GenomeInfoDb,seqlevelsStyle)
importFrom(GenomeInfoDb,seqnames)
importFrom(GenomicRanges,makeGRangesFromDataFrame)
importFrom(R.utils,gunzip)
importFrom(Rsamtools,bgzip)
importFrom(VariantAnnotation,makeVRangesFromGRanges)
importFrom(VariantAnnotation,scanVcfHeader)
importFrom(data.table,":=")
@@ -58,9 +60,11 @@ importFrom(rtracklayer,import.chain)
importFrom(rtracklayer,liftOver)
importFrom(rtracklayer,strand)
importFrom(rtracklayer,width)
importFrom(seqminer,tabix.createIndex)
importFrom(stats,complete.cases)
importFrom(stats,qchisq)
importFrom(stats,sd)
importFrom(stats,setNames)
importFrom(stringr,str_split)
importFrom(utils,capture.output)
importFrom(utils,data)
1 change: 1 addition & 0 deletions NEWS.md
@@ -13,6 +13,7 @@
* Added tests for `get_genome_builds`
* Added an early check that the directory containing `save_path` was
actually created (rather than finding out at the very end of the pipeline).
* Tabix-indexing is now available for tabular output data.

## CHANGES IN VERSION 1.1.26

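The user-facing effect of that last entry: tabular output can now be written as a bgzipped, tabix-indexed file. A hedged usage sketch (assumes `format_sumstats()` exposes the `tabix_index` argument it forwards to `write_sumstats()` below; the other arguments shown are illustrative):

    # Example GWAS file shipped with the package
    eduAttainOkbayPth <- system.file("extdata", "eduAttainOkbay.txt",
                                     package = "MungeSumstats")
    # tabix_index = TRUE should produce a *.tsv.bgz file plus a tabix index
    reformatted <- MungeSumstats::format_sumstats(
        path = eduAttainOkbayPth,
        ref_genome = "GRCh37",
        tabix_index = TRUE
    )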
23 changes: 18 additions & 5 deletions R/check_save_path.R
@@ -8,7 +8,8 @@
check_save_path <- function(save_path,
log_folder,
log_folder_ind,
write_vcf = FALSE) {
write_vcf = FALSE,
tabix_index = FALSE) {
#### Add warning to users that temp files aren't actually saved ####
if (dirname(save_path) == tempdir()) {
message(
@@ -81,11 +82,14 @@ check_save_path <- function(save_path,
}
}
}
#### Distinguish between VCF and tabular formats ####
if (write_vcf) {
save_path <- gsub(paste(suffixes, collapse = "|"),
".vcf.gz", save_path)
sep <- "\t"
file_type <- "vcf"
# No need to handle tabix_index here: VCF output is always
# compressed (vcf.gz) anyway.
} else {
#### Account for mismatch between write_vcf and save_path ####
suffixes.vcf <- supported_suffixes(
@@ -106,13 +110,22 @@
# if output vcf, save log file .tsv.gz
extension <- ".tsv.gz"
}
# get extension of save path for log files
#### Check for tabix-indexing ####
if(tabix_index){
# Uses a slightly modified version of the
# Rsamtools::bgzip default destination path
save_path <- sprintf("%s.bgz",
sub("\\.gz$|\\.bgz$", "", save_path))
extension <- sprintf("%s.bgz",
sub("\\.gz$|\\.bgz$", "", extension))
}
#### get extension of save path for log files ###
if (!exists("extension")) {
# if output vcf, save log file type just .tsv.gz
extension <- ".tsv.gz"
}
}
#### Make sure dir exists ####
if (is.character(save_path)) {
dir.create(dirname(save_path),
showWarnings = FALSE, recursive = TRUE)
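The suffix handling added above can be illustrated in isolation: any existing `.gz`/`.bgz` ending is stripped before `.bgz` is appended, so the save path always ends in `.bgz`. A standalone sketch (the helper name `to_bgz` is invented for the demo):

    to_bgz <- function(save_path) {
        # mirrors the sprintf()/sub() logic used in check_save_path()
        sprintf("%s.bgz", sub("\\.gz$|\\.bgz$", "", save_path))
    }
    to_bgz("sumstats.tsv.gz")   # "sumstats.tsv.bgz"
    to_bgz("sumstats.tsv")      # "sumstats.tsv.bgz"
    to_bgz("sumstats.tsv.bgz")  # "sumstats.tsv.bgz"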
37 changes: 37 additions & 0 deletions R/column_dictionary.R
@@ -0,0 +1,37 @@
#' Map column names to positions.
#'
#' Useful in situations where you need to specify columns by
#' index instead of name (e.g. awk queries).
#'
#' @source Borrowed function from
#' \href{https://github.com/RajLabMSSM/echotabix/blob/main/R/convert.R}{
#' echotabix}.
#'
#' @source
#' \code{
#' eduAttainOkbayPth <- system.file("extdata", "eduAttainOkbay.txt",
#' package = "MungeSumstats"
#' )
#' tmp <- tempfile(fileext = ".tsv")
#' file.copy(eduAttainOkbayPth, tmp)
#' cdict <- MungeSumstats:::column_dictionary(file_path = tmp)
#' }
#'
#' @param file_path Path to the full summary stats file
#' (or really any file you want to make a column dictionary for).
#'
#' @return Named vector of column positions.
#'
#' @keywords internal
#' @importFrom stats setNames
column_dictionary <- function(file_path) {
# Get the index of each column name
header <- read_header(path = file_path,
# n must be 2 or else
# fread won't be able to parse text
n = 2,
skip_vcf_metadata = TRUE)
cNames <- colnames(data.table::fread(text = header))
colDict <- stats::setNames(seq(1, length(cNames)), cNames)
return(colDict)
}
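To make the return value concrete, a small illustrative run (the file contents and column names below are invented for the demo; the roxygen example above uses the bundled eduAttainOkbay.txt instead):

    tmp <- tempfile(fileext = ".tsv")
    writeLines(c("SNP\tCHR\tBP\tP",
                 "rs12345\t1\t10583\t0.05"), tmp)
    cdict <- MungeSumstats:::column_dictionary(file_path = tmp)
    cdict
    #> SNP CHR  BP   P
    #>   1   2   3   4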
5 changes: 3 additions & 2 deletions R/format_sumstats.R
@@ -812,13 +812,14 @@ format_sumstats <- function(path,


#### WRITE data.table TO PATH ####
write_sumstats(
check_save_out$save_path <- write_sumstats(
sumstats_dt = sumstats_return$sumstats_dt,
save_path = check_save_out$save_path,
sep = check_save_out$sep,
write_vcf = write_vcf,
tabix_index = tabix_index,
nThread = nThread
nThread = nThread,
return_path = TRUE
)
rm(rsids) # free up memory

65 changes: 65 additions & 0 deletions R/index_tabular.R
@@ -0,0 +1,65 @@
#' Convert summary stats file to tabix format
#'
#' @source Borrowed function from
#' \href{https://github.com/RajLabMSSM/echotabix/blob/main/R/convert.R}{
#' echotabix}.
#'
#' @param path Path to GWAS summary statistics file.
#' @param verbose Print messages.
#' @inheritParams dt_to_granges
#'
#' @family tabix
#' @examples
#' eduAttainOkbayPth <- system.file("extdata", "eduAttainOkbay.txt",
#' package = "MungeSumstats")
#' sumstats_dt <- data.table::fread(eduAttainOkbayPth, nThread = 1)
#' sumstats_dt <-
#' MungeSumstats:::standardise_sumstats_column_headers_crossplatform(
#' sumstats_dt = sumstats_dt)$sumstats_dt
#' sumstats_dt <- MungeSumstats:::sort_coords(sumstats_dt = sumstats_dt)
#' path <- tempfile(fileext = ".tsv")
#' MungeSumstats::write_sumstats(sumstats_dt = sumstats_dt, save_path = path)
#'
#' indexed_file <- MungeSumstats::index_tabular(path = path)
#' @export
#' @importFrom Rsamtools bgzip
#' @importFrom seqminer tabix.createIndex
index_tabular <- function(path,
chrom_col = "CHR",
start_col = "BP",
end_col = start_col,
verbose = TRUE) {
msg <- paste("Converting full summary stats file to",
"tabix format for fast querying...")
message(msg)
#### Read header and make dictionary ####
cdict <- column_dictionary(file_path = path)
#### Check that required columns exist ####
if(!chrom_col %in% names(cdict)) stop("chrom_col not found in file.")
if(!start_col %in% names(cdict)) stop("start_col not found in file.")
if(!end_col %in% names(cdict)) stop("end_col not found in file.")
#### Make sure input file isn't empty ####
if (file.size(path) == 0) {
msg2 <- paste("Removing empty file:", path)
messager(msg2)
out <- file.remove(path)
stop("Cannot tabix-index an empty file: ", path)
}
### File MUST be bgzipped first
message("Ensuring file is bgzipped.")
bgz_file <- Rsamtools::bgzip(file = path,
dest = sprintf("%s.bgz",
sub("\\.gz$|\\.bgz$", "",
path)),
overwrite = TRUE)
### Tabix-index file
message("Tabix-indexing file.")
seqminer::tabix.createIndex(
bgzipFile = bgz_file,
sequenceColumn = cdict[[chrom_col]],
startColumn = cdict[[start_col]],
endColumn = cdict[[end_col]],
## Use the first column's name as the meta/comment character
## so the header line is skipped during indexing
metaChar = names(cdict)[1]
)
return(bgz_file)
}
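Once a file has been indexed, individual genomic regions can be queried without reading the whole file into memory. A hedged querying sketch (assumes chromosome names in the file are unprefixed, e.g. "3" rather than "chr3"; the region shown is hypothetical):

    # continuing from the roxygen example above, where `path` is a sorted .tsv
    bgz_file <- MungeSumstats::index_tabular(path = path)
    # pull only the rows overlapping a region of interest
    hits <- seqminer::tabix.read.table(
        tabixFile = bgz_file,
        tabixRange = "3:1000000-5000000"
    )
    head(hits)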
20 changes: 17 additions & 3 deletions R/write_sumstats.R
@@ -2,10 +2,15 @@
#'
#' @param sumstats_dt data table obj of the summary statistics
#' file for the GWAS.
#' @param return_path Return \code{save_path}.
#' This will have been modified in some cases
#' (e.g. after compressing and tabix-indexing a
#' previously un-compressed file).
#' @inheritParams data.table::fread
#' @inheritParams format_sumstats
#'
#' @return \code{VRanges} object
#' @returns If \code{return_path=TRUE}, returns \code{save_path}.
#' Else returns \code{NULL}.
#'
#' @export
#' @importFrom GenomicRanges makeGRangesFromDataFrame
@@ -24,7 +29,8 @@ write_sumstats <- function(sumstats_dt,
sep = "\t",
write_vcf = FALSE,
tabix_index = FALSE,
nThread = 1) {
nThread = 1,
return_path = FALSE) {
#### Make sure the directory actually exists
if (is.character(save_path)) {
dir.create(dirname(save_path),
@@ -47,7 +53,8 @@
} else {
message("Writing in VCF format ==> ", save_path)
}
VariantAnnotation::writeVcf(vr,
VariantAnnotation::writeVcf(
obj = vr,
filename = save_path,
index = tabix_index
)
@@ -59,5 +66,12 @@
sep = sep,
nThread = nThread
)
if(tabix_index){
save_path <- index_tabular(path = save_path,
chrom_col = "CHR",
start_col = "BP",
verbose = TRUE)
}
}
if(return_path) return(save_path)
}
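A hedged sketch of the new return-path behaviour (reuses a formatted, coordinate-sorted `sumstats_dt` such as the one built in the `index_tabular()` roxygen example; file names are illustrative):

    out_path <- tempfile(fileext = ".tsv")
    # With tabix_index = TRUE the file is bgzipped, so the path actually written
    # differs from save_path; return_path = TRUE lets the caller capture it.
    new_path <- MungeSumstats::write_sumstats(
        sumstats_dt = sumstats_dt,
        save_path = out_path,
        tabix_index = TRUE,
        return_path = TRUE
    )
    new_path  # e.g. ".../file1a2b3c.tsv.bgz"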
11 changes: 10 additions & 1 deletion man/check_save_path.Rd
34 changes: 34 additions & 0 deletions man/column_dictionary.Rd
47 changes: 47 additions & 0 deletions man/index_tabular.Rd
(Generated documentation files; diffs not rendered. The remaining changed files did not load.)
