document and test as_paf

dwinter · Dec 1, 2020 · 2984e7b · 2984e7b
1 parent b317ed6
commit 2984e7b
Showing 5 changed files with 91 additions and 19 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -6,3 +6,5 @@
 ^codecov\.yml$
 ^_pkgdown\.yml$
 ^docs$
+^doc$
+^Meta$
diff --git a/NAMESPACE b/NAMESPACE
@@ -6,6 +6,7 @@ S3method(print,pafr)
 export(Gb_lab)
 export(Kb_lab)
 export(Mb_lab)
+export(as_paf)
 export(chrom_sizes)
 export(dotplot)
 export(filter_secondary_alignments)

diff --git a/R/IO.r b/R/IO.r
@@ -132,7 +132,20 @@ read_paf <- function(file_name, tibble=FALSE, include_tags=TRUE) {
     res
 }
 
-#;
+#' Coerce a data.frame or tibble into a pafr object
+#' 
+#' The main reason to use this function is speed up the process of reading in
+#' a large paf file that has no tags. Functions like read.table, read_delim
+#' (reader) and fread (data.table) can process a 12 column file  more quickly
+#' than pafr's read_paf. If you you do not need tag data for your analyses or
+#' visualizations, it might make sense to use a fast reading function to get a
+#' 12 column data.frame,  convert that data.frame into a `pafr  object with this
+#' function. The `pafr` object can then work easily with the functions in this package.
+#' @export 
+#' @param paf_data_frame a data.frame object with 12 columns. Column names and
+#' types wil be over-written in the returned object
+#' @return a pafr object 
+#' @seealso read_pa
 as_paf <- function(paf_data_frame){
     #lets be assertive
     if( ncol(paf_data_frame) != 12 ){
@@ -141,14 +154,6 @@ as_paf <- function(paf_data_frame){
     col_names <- c("qname", "qlen", "qstart", "qend", "strand",
                    "tname", "tlen", "tstart", "tend", "nmatch",
                    "alen", "mapq")
-  as_paf <- function(paf_data_frame){
-    #lets be assertive
-    if( nrow(paf_data_frame) != 12 ){
-        stop("data.frame should be in paf format, with 12 columns")
-    }
-    col_names <- c("qname", "qlen", "qstart", "qend", "strand",
-                   "tname", "tlen", "tstart", "tend", "nmatch",
-                   "alen", "mapq")
     for(col_idx in c(2:4, 7:12)){
         if( !(is_numericable(paf_data_frame[[col_idx]]) ) ){
             msg <- paste0("Column", col_idx, "('", col_names[col_idx], 
@@ -161,12 +166,7 @@ as_paf <- function(paf_data_frame){
     names(res) <- col_names
     class(res) <- c("pafr", "data.frame")
     res
-}    #ok, looks good
-    res <- .make_numeric(paf_data_frame, c(2, 3, 4, 7:12))
-    names(res) <- col_names
-    class(res) <- c("pafr", "data.frame")
-    res
-}
+}    
 
 #is a vector, possibly in character form, able to be treated as a numeric?
 is_numericable <- function(vec){

diff --git a/tests/testthat/test_IO.r b/tests/testthat/test_IO.r
@@ -2,8 +2,10 @@ context("file IO")
 # Setup
 
 ali_pafr <- read_paf("test_ali.paf")
+ali_skip_tags <- read_paf("test_ali.paf", include_tags=FALSE)
 ali_tib <- read_paf("test_ali.paf", tibble=TRUE)
 ali_tagless <- read_paf("tagless.paf")
+ali_df <- read.table("tagless.paf")
 
 
 tag_names <- c("NM", "ms", "AS", "nn", "tp", "cm", "s1", "s2", "dv", "cg", "zd")
@@ -51,4 +53,26 @@ test_that("can handle tagless alignments", {
     expect_equal(ncol(ali_tagless), 12)
     expect_output(print(ali_tagless, "0 tags"))
 })
+test_that("We can ignore tags read_paf", {
+    expect_that(ali_skip_tags, is_a("pafr"))
+    expect_that( ncol(ali_skip_tags), equals(12))
+})
+
+
+test_that("can convert data.frame to pafr", {
+    expect_that(ali_df, is_a("data.frame"))
+    ali_converted <- as_paf(ali_df)
+    expect_that(ali_converted, is_a("pafr"))
+
+    expect_that( nrow(ali_converted), equals(5))
+    expect_that( ncol(ali_converted), equals(12))
+    expect_that( sum(ali_converted[["alen"]]), equals(240777))
+})
 
+test_that("throw errors on converting data.frames", {
+    ali_df$extra_col <- "shouldn't be here"
+    expect_error( as_paf(ali_df) )
+    ali_df$extra_col <- NULL
+    ali_df[[12]] <- "Should be a number"
+    expect_error( as_paf(ali_df) )
+})
diff --git a/vignettes/Introduction_to_pafr.Rmd b/vignettes/Introduction_to_pafr.Rmd
@@ -97,10 +97,7 @@ similarity.
 
 Because the `pafr` object is effectively a `data.frame`, we can again use standard R
 functions to inspect or analyse it. For example, we can calculate the mean
-divergence level for alignments featuring each query sequence.
-<!---
-David: what do you mean by "alignments featuring each query sequence"?
--->
+divergence level for alignments featuring each sequence in the query genome.
 
 ```{r, compare_q}
 by_q <- aggregate(dv ~ qname, data=ali, FUN=mean)
@@ -111,6 +108,54 @@ Interestingly enough, `Q_chrm` is the mitochondrial genome, so it appears that t
 mitochondrial genome displays less divergence than any of the autosomal chromosomes in this particular species.
 
 
+### Reading really big alignments
+
+Though `read_paf` has been optimised to process the tags in a paf file quickly, it
+can still take some time to read paf files that have a large number of
+alignments. For example, it takes about 30 seconds to read an alignment between
+two fragmented vertebrate genomes on my desktop PC. If speed is of the essence
+and you know you don't need the data encoded in the tags, there are a few tricks
+to get a `pafr` object more quickly.
+
+First, you can tell read_paf to ignore the tag information using the
+`include_tags` argument. Let's use `microbenchmark` to see how much time we can
+save by skipping the tags.
+
+```{r, include-tags}
+microbenchmark::microbenchmark(
+  tags = read_paf(path_to_fungal_alignment),
+  no_tags = read_paf(path_to_fungal_alignment, include_tags=FALSE),
+  times=10
+)
+```
+
+That's about five times faster on my computer.
+
+There may be cases the alignemnts are so huge or fragmented that they are still
+taking a long time to read in. In this case, you may want to remove the tag data
+prior ot reading in the alignments. For example, you could use the unix utility
+`cut`:
+
+```sh
+cut -f12 [alignment_with_tags.paf] > tagless.paf
+```
+
+With not tags to process, read_paf will be quicker. In this case, you could also
+read the .paf file in with `read.table` or a faster reading function like
+`fread` from data.table. The data.frame returned by that function can then be
+converted to a `pafr` object for use with the plotting functions below using 
+`as_pafr`. 
+
+
+```r
+tagless_df <- read.table("tagless.paf")
+tagless_ali <- as_paf(tagless_df)
+```
+pafr` provides the function `as_pafr` to convert
+a data.frame (or tibble or similar) into a `pafr`
+system.time(read_paf(path_to_fungal_alignment, include_tags=FALSE))
+
+
 ## Filtering and subsetting alignments
 
 Often the first thing you will want to do after reading in data is to drop
-Original file line number
+Diff line change
@@ @@ -6,3 +6,5 @@ @@
     ^codecov\.yml$
     ^_pkgdown\.yml$
     ^docs$
+    ^doc$
+    ^Meta$