Enable tabix-indexing for tabular output
bschilder committed Oct 3, 2021
1 parent 2de2d68 commit 8ab84dc
Showing 13 changed files with 280 additions and 14 deletions.
4 changes: 3 additions & 1 deletion DESCRIPTION
@@ -66,6 +66,8 @@ Suggests:
testthat (>= 3.0.0),
UpSetR,
BiocStyle,
covr
covr,
seqminer,
Rsamtools
Config/testthat/edition: 3
VignetteBuilder: knitr
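
Note: `seqminer` and `Rsamtools` are added under Suggests rather than Imports, so users who want tabix-indexed tabular output need to install them themselves. A minimal installation sketch (not part of this commit; assumes BiocManager is the preferred installer):

    if (!requireNamespace("BiocManager", quietly = TRUE)) {
        install.packages("BiocManager")
    }
    # seqminer (CRAN) builds the tabix index; Rsamtools (Bioconductor) handles bgzip compression
    BiocManager::install(c("seqminer", "Rsamtools"))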
4 changes: 4 additions & 0 deletions NAMESPACE
@@ -5,6 +5,7 @@ export(find_sumstats)
export(format_sumstats)
export(get_genome_builds)
export(import_sumstats)
export(index_tabular)
export(load_snp_loc_data)
export(read_sumstats)
export(write_sumstats)
@@ -16,6 +17,7 @@ importFrom(GenomeInfoDb,seqlevelsStyle)
importFrom(GenomeInfoDb,seqnames)
importFrom(GenomicRanges,makeGRangesFromDataFrame)
importFrom(R.utils,gunzip)
importFrom(Rsamtools,bgzip)
importFrom(VariantAnnotation,makeVRangesFromGRanges)
importFrom(VariantAnnotation,scanVcfHeader)
importFrom(data.table,":=")
@@ -58,9 +60,11 @@ importFrom(rtracklayer,import.chain)
importFrom(rtracklayer,liftOver)
importFrom(rtracklayer,strand)
importFrom(rtracklayer,width)
importFrom(seqminer,tabix.createIndex)
importFrom(stats,complete.cases)
importFrom(stats,qchisq)
importFrom(stats,sd)
importFrom(stats,setNames)
importFrom(stringr,str_split)
importFrom(utils,capture.output)
importFrom(utils,data)
1 change: 1 addition & 0 deletions NEWS.md
@@ -13,6 +13,7 @@
* Added tests for `get_genome_builds`
* Added an early check that the directory containing `save_path` was
actually created (rather than finding out at the very end of the pipeline).
* Tabix-indexing is now available for tabular output data.

## CHANGES IN VERSION 1.1.26

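The user-facing effect of that last entry: tabular output can now be written as a bgzipped, tabix-indexed file. A hedged usage sketch (assumes `format_sumstats()` exposes the `tabix_index` argument it forwards to `write_sumstats()` below; the other arguments shown are illustrative):

    # Example GWAS file shipped with the package
    eduAttainOkbayPth <- system.file("extdata", "eduAttainOkbay.txt",
                                     package = "MungeSumstats")
    # tabix_index = TRUE should produce a *.tsv.bgz file plus a tabix index
    reformatted <- MungeSumstats::format_sumstats(
        path = eduAttainOkbayPth,
        ref_genome = "GRCh37",
        tabix_index = TRUE
    )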
23 changes: 18 additions & 5 deletions R/check_save_path.R
@@ -8,7 +8,8 @@
check_save_path <- function(save_path,
log_folder,
log_folder_ind,
write_vcf = FALSE) {
write_vcf = FALSE,
tabix_index = FALSE) {
#### Add warning to users that temp files aren't actually saved ####
if (dirname(save_path) == tempdir()) {
message(
@@ -81,11 +82,14 @@ check_save_path <- function(save_path,
}
}
}
#### Distinguish between VCF and tabular formats ####
if (write_vcf) {
save_path <- gsub(paste(suffixes, collapse = "|"),
".vcf.gz", save_path)
sep <- "\t"
file_type <- "vcf"
# No need to handle tabix_index here: VCF output is always
# compressed (vcf.gz) anyway.
} else {
#### Account for mismatch between write_vcf and save_path ####
suffixes.vcf <- supported_suffixes(
@@ -106,13 +110,22 @@
# if output vcf, save log file .tsv.gz
extension <- ".tsv.gz"
}
# get extension of save path for log files
#### Check for tabix-indexing ####
if(tabix_index){
# Uses a slightly modified version of the
# Rsamtools::bgzip default destination path
save_path <- sprintf("%s.bgz",
sub("\\.gz$|\\.bgz$", "", save_path))
extension <- sprintf("%s.bgz",
sub("\\.gz$|\\.bgz$", "", extension))
}
#### get extension of save path for log files ###
if (!exists("extension")) {
# if output vcf, save log file type just .tsv.gz
extension <- ".tsv.gz"
}
}
#### Make sure dir exists ####
if (is.character(save_path)) {
dir.create(dirname(save_path),
showWarnings = FALSE, recursive = TRUE)
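The suffix handling added above can be illustrated in isolation: any existing `.gz`/`.bgz` ending is stripped before `.bgz` is appended, so the save path always ends in `.bgz`. A standalone sketch (the helper name `to_bgz` is invented for the demo):

    to_bgz <- function(save_path) {
        # mirrors the sprintf()/sub() logic used in check_save_path()
        sprintf("%s.bgz", sub("\\.gz$|\\.bgz$", "", save_path))
    }
    to_bgz("sumstats.tsv.gz")   # "sumstats.tsv.bgz"
    to_bgz("sumstats.tsv")      # "sumstats.tsv.bgz"
    to_bgz("sumstats.tsv.bgz")  # "sumstats.tsv.bgz"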
37 changes: 37 additions & 0 deletions R/column_dictionary.R
@@ -0,0 +1,37 @@
#' Map column names to positions.
#'
#' Useful in situations where you need to specify columns by
#' index instead of name (e.g. awk queries).
#'
#' @source Borrowed function from
#' \href{https://github.com/RajLabMSSM/echotabix/blob/main/R/convert.R}{
#' echotabix}.
#'
#' @source
#' \code{
#' eduAttainOkbayPth <- system.file("extdata", "eduAttainOkbay.txt",
#' package = "MungeSumstats"
#' )
#' tmp <- tempfile(fileext = ".tsv")
#' file.copy(eduAttainOkbayPth, tmp)
#' cdict <- MungeSumstats:::column_dictionary(file_path = tmp)
#' }
#'
#' @param file_path Path to the full summary stats file
#' (or really any file you want to make a column dictionary for).
#'
#' @return Named vector of column positions.
#'
#' @keywords internal
#' @importFrom stats setNames
column_dictionary <- function(file_path) {
# Get the index of each column name
header <- read_header(path = file_path,
# n must be 2 or else
# fread won't be able to parse text
n = 2,
skip_vcf_metadata = TRUE)
cNames <- colnames(data.table::fread(text = header))
colDict <- stats::setNames(seq(1, length(cNames)), cNames)
return(colDict)
}
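To make the return value concrete, a small illustrative run (the file contents and column names below are invented for the demo; the roxygen example above uses the bundled eduAttainOkbay.txt instead):

    tmp <- tempfile(fileext = ".tsv")
    writeLines(c("SNP\tCHR\tBP\tP",
                 "rs12345\t1\t10583\t0.05"), tmp)
    cdict <- MungeSumstats:::column_dictionary(file_path = tmp)
    cdict
    #> SNP CHR  BP   P
    #>   1   2   3   4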
5 changes: 3 additions & 2 deletions R/format_sumstats.R
@@ -812,13 +812,14 @@ format_sumstats <- function(path,


#### WRITE data.table TO PATH ####
write_sumstats(
check_save_out$save_path <- write_sumstats(
sumstats_dt = sumstats_return$sumstats_dt,
save_path = check_save_out$save_path,
sep = check_save_out$sep,
write_vcf = write_vcf,
tabix_index = tabix_index,
nThread = nThread
nThread = nThread,
return_path = TRUE
)
rm(rsids) # free up memory

65 changes: 65 additions & 0 deletions R/index_tabular.R
@@ -0,0 +1,65 @@
#' Convert summary stats file to tabix format
#'
#' @source Borrowed function from
#' \href{https://github.com/RajLabMSSM/echotabix/blob/main/R/convert.R}{
#' echotabix}.
#'
#' @param path Path to GWAS summary statistics file.
#' @param verbose Print messages.
#' @inheritParams dt_to_granges
#'
#' @family tabix
#' @examples
#' eduAttainOkbayPth <- system.file("extdata", "eduAttainOkbay.txt",
#' package = "MungeSumstats")
#' sumstats_dt <- data.table::fread(eduAttainOkbayPth, nThread = 1)
#' sumstats_dt <-
#' MungeSumstats:::standardise_sumstats_column_headers_crossplatform(
#' sumstats_dt = sumstats_dt)$sumstats_dt
#' sumstats_dt <- MungeSumstats:::sort_coords(sumstats_dt = sumstats_dt)
#' path <- tempfile(fileext = ".tsv")
#' MungeSumstats::write_sumstats(sumstats_dt = sumstats_dt, save_path = path)
#'
#' indexed_file <- MungeSumstats::index_tabular(path = path)
#' @export
#' @importFrom Rsamtools bgzip
#' @importFrom seqminer tabix.createIndex
index_tabular <- function(path,
chrom_col = "CHR",
start_col = "BP",
end_col = start_col,
verbose = TRUE) {
msg <- paste("Converting full summary stats file to",
"tabix format for fast querying...")
message(msg)
#### Read header and make dictionary ####
cdict <- column_dictionary(file_path = path)
#### Check that required columns exist ####
if(!chrom_col %in% names(cdict)) stop("chrom_col not found in file.")
if(!start_col %in% names(cdict)) stop("start_col not found in file.")
if(!end_col %in% names(cdict)) stop("end_col not found in file.")
#### Make sure input file isn't empty ####
if (file.size(path) == 0) {
msg2 <- paste("Removing empty file:", path)
messager(msg2)
out <- file.remove(path)
stop("Cannot tabix-index an empty file: ", path)
}
### File MUST be bgzipped first
message("Ensuring file is bgzipped.")
bgz_file <- Rsamtools::bgzip(file = path,
dest = sprintf("%s.bgz",
sub("\\.gz$|\\.bgz$", "",
path)),
overwrite = TRUE)
### Tabix-index file
message("Tabix-indexing file.")
seqminer::tabix.createIndex(
bgzipFile = bgz_file,
sequenceColumn = cdict[[chrom_col]],
startColumn = cdict[[start_col]],
endColumn = cdict[[end_col]],
## Use the first column's name as the meta/comment character
## so the header line is skipped during indexing
metaChar = names(cdict)[1]
)
return(bgz_file)
}
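Once a file has been indexed, individual genomic regions can be queried without reading the whole file into memory. A hedged querying sketch (assumes chromosome names in the file are unprefixed, e.g. "3" rather than "chr3"; the region shown is hypothetical):

    # continuing from the roxygen example above, where `path` is a sorted .tsv
    bgz_file <- MungeSumstats::index_tabular(path = path)
    # pull only the rows overlapping a region of interest
    hits <- seqminer::tabix.read.table(
        tabixFile = bgz_file,
        tabixRange = "3:1000000-5000000"
    )
    head(hits)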
20 changes: 17 additions & 3 deletions R/write_sumstats.R
@@ -2,10 +2,15 @@
#'
#' @param sumstats_dt data table obj of the summary statistics
#' file for the GWAS.
#' @param return_path Return \code{save_path}.
#' This will have been modified in some cases
#' (e.g. after compressing and tabix-indexing a
#' previously un-compressed file).
#' @inheritParams data.table::fread
#' @inheritParams format_sumstats
#'
#' @return \code{VRanges} object
#' @returns If \code{return_path=TRUE}, returns \code{save_path}.
#' Else returns \code{NULL}.
#'
#' @export
#' @importFrom GenomicRanges makeGRangesFromDataFrame
@@ -24,7 +29,8 @@ write_sumstats <- function(sumstats_dt,
sep = "\t",
write_vcf = FALSE,
tabix_index = FALSE,
nThread = 1) {
nThread = 1,
return_path = FALSE) {
#### Make sure the directory actually exists
if (is.character(save_path)) {
dir.create(dirname(save_path),
@@ -47,7 +53,8 @@
} else {
message("Writing in VCF format ==> ", save_path)
}
VariantAnnotation::writeVcf(vr,
VariantAnnotation::writeVcf(
obj = vr,
filename = save_path,
index = tabix_index
)
@@ -59,5 +66,12 @@
sep = sep,
nThread = nThread
)
if(tabix_index){
save_path <- index_tabular(path = save_path,
chrom_col = "CHR",
start_col = "BP",
verbose = TRUE)
}
}
if(return_path) return(save_path)
}
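A hedged sketch of the new return-path behaviour (reuses a formatted, coordinate-sorted `sumstats_dt` such as the one built in the `index_tabular()` roxygen example; file names are illustrative):

    out_path <- tempfile(fileext = ".tsv")
    # With tabix_index = TRUE the file is bgzipped, so the path actually written
    # differs from save_path; return_path = TRUE lets the caller capture it.
    new_path <- MungeSumstats::write_sumstats(
        sumstats_dt = sumstats_dt,
        save_path = out_path,
        tabix_index = TRUE,
        return_path = TRUE
    )
    new_path  # e.g. ".../file1a2b3c.tsv.bgz"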
11 changes: 10 additions & 1 deletion man/check_save_path.Rd
34 changes: 34 additions & 0 deletions man/column_dictionary.Rd
47 changes: 47 additions & 0 deletions man/index_tabular.Rd
(Generated documentation files; diffs not rendered. The remaining changed files did not load.)
