robinweide
diff --git a/‎DESCRIPTION
+2-2 b/‎DESCRIPTION
+2-2
diff --git a/‎NAMESPACE
+11 b/‎NAMESPACE
+11
diff --git a/‎NEWS.md
+15-1 b/‎NEWS.md
+15-1
diff --git a/‎R/align.R
+44-14 b/‎R/align.R
+44-14
diff --git a/‎R/checkPrimer.R
+142 b/‎R/checkPrimer.R
+142
@@ -1,11 +1,11 @@
 Package: tagMeppr
 Type: Package
-Title: A computational pipeline to map tagmap-insertions.
+Title: A computational pipeline to map tagmap-insertions
 Version: 0.1.0
 Authors@R: person("Robin H.", "van der Weide", email = "[email protected]", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-6466-7280"))
 Author: Robin H. van der Weide [aut, cre]
 Maintainer: Robin H. van der Weide <[email protected]>
-Description: Align, interrogate and visualise your tagmap sequencing data, without leaving R.
+Description: Align, interrogate and visualise your tagmap sequencing data, without leaving R
 License: MIT + file LICENSE
 Depends: R (>= 3.4.0)
 URL: https://github.com/robinweide/tagmeppr
 
@@ -2,31 +2,41 @@
 
 S3method(print,tagMepprIndex)
 S3method(print,tagMepprSample)
+S3method(results,tagMepprSample)
 export(align)
+export(checkPrimer)
 export(findInsertions)
 export(loadIndex)
 export(makeIndex)
 export(newTagMeppr)
 export(plotInsertions)
 export(plotSite)
+export(results)
 export(runIDgen)
+export(tagMepprCol)
 importFrom(BSgenome,getSeq)
 importFrom(BSgenome.Hsapiens.UCSC.hg19,BSgenome.Hsapiens.UCSC.hg19)
 importFrom(BiocGenerics,end)
 importFrom(BiocGenerics,start)
 importFrom(BiocGenerics,strand)
+importFrom(Biostrings,DNAString)
+importFrom(Biostrings,letterFrequency)
 importFrom(Biostrings,readDNAStringSet)
+importFrom(Biostrings,reverseComplement)
 importFrom(Biostrings,vmatchPattern)
 importFrom(Biostrings,writeXStringSet)
 importFrom(GenomeInfoDb,seqlengths)
 importFrom(GenomicAlignments,readGAlignments)
 importFrom(GenomicAlignments,width)
 importFrom(GenomicRanges,GRanges)
 importFrom(GenomicRanges,GRangesList)
+importFrom(GenomicRanges,as.data.frame)
 importFrom(GenomicRanges,countOverlaps)
 importFrom(GenomicRanges,findOverlaps)
 importFrom(GenomicRanges,makeGRangesFromDataFrame)
+importFrom(GenomicRanges,reduce)
 importFrom(GenomicRanges,seqnames)
+importFrom(GenomicRanges,strand)
 importFrom(IRanges,IRanges)
 importFrom(IRanges,coverage)
 importFrom(IRanges,subsetByOverlaps)
@@ -71,6 +81,7 @@ importFrom(reshape2,colsplit)
 importFrom(rtracklayer,export.bed)
 importFrom(rtracklayer,import.bed)
 importFrom(scales,extended_breaks)
+importFrom(scales,hue_pal)
 importFrom(stats,approx)
 importFrom(stats,complete.cases)
 importFrom(stats,p.adjust)
 
@@ -1,3 +1,18 @@
+# tagmeppr 0.2
+
+* results-method to make a df of results
+* updated results-metadata to be more clear (orientation is now strand info)
+* `plotInsertions` now handles failed experiments without insertions
+* counts are now correct: `align()` previously reported some duplicate reads (which were not removed due to strand-info)
+* added the option to use padding in `findInsertions()`, which fixes issues with the mapper reporting matches over the TIS.
+* added `checkPrimer()` to check the assumed primer layout (reverse primer on the 5' ITR, forward primer on the 3' ITR)
+* `findInsertions()` now uses D_scores to set orientation as strand
+* `findInsertions()` uses the rev5_fwd3-flag for setting the orientation
+* `plotInsertions()` has better spacing, colours and handles orientation.
+* `results()` now enables filtering on pvalue, counts and orientation
+* `plotSite()` is ready to use and also shows the orientation
+* a bug in `plotSite()` regarding x-ticks is fixed
+
 # tagmeppr 0.1
 
 * re-wrote indexing part to use internal ITRs
@@ -9,4 +24,3 @@
 * put postalign inside align
 * made a minimised fastq-seq to speed things up
 * made nice print methods
-* results-method to make a df of results
 
@@ -302,28 +302,58 @@ align = function(exp, ref, cores = 20, empericalCentre = F, verbose = F){
 
   combined = lapply(c('FWD', 'REV'), function(i){
     combined = list(mainCorpus[[i]],SAlist[[i]],XAlist[[i]])
-    combined <- suppressWarnings(dplyr::bind_rows(combined))
-    combined = unique(combined)
-
-    # split combined on the mapping-position in the casette
-    tmpCassette = combined[combined$seqnames == exp$insertName, ]
-    orDF = data.frame(readName = tmpCassette$readName,
-                      orientation = ifelse( (tmpCassette$start + 2) < cassMid, 5,3))
-    combined = suppressWarnings(dplyr::full_join(combined, orDF, by = c("readName")))
 
-    # remove read with no cassette-loc
-    combined = combined[!is.na(combined$orientation), ]
+    combined <- suppressWarnings(dplyr::bind_rows(combined))
 
-    unique(combined)
   })
 
-  exp$alignedReadsFWD = GenomicRanges::makeGRangesFromDataFrame( combined[[1]],
+
+  alignedReadsFWD = GenomicRanges::makeGRangesFromDataFrame( combined[[1]],
                                                                  keep.extra.columns = T)
-  exp$alignedReadsREV = GenomicRanges::makeGRangesFromDataFrame( combined[[2]],
+  alignedReadsREV = GenomicRanges::makeGRangesFromDataFrame( combined[[2]],
                                                                  keep.extra.columns = T)
-  exp$insertionMid = cassMid
+
+
+
+
+  #################################################################### ITRs
+  # get a GRanges of the two arms: this biostrings should be in the ref-object
+  ITRpadRange = ref$NpadRange
+
+  #################################################################### prettyBam
+  ### FWD
+
+  BiocGenerics::strand(alignedReadsFWD) = "*"
+  alignedReadsFWDlist =  GenomicRanges::split(alignedReadsFWD, ~ readName)
+  alignedReadsFWDlist = GenomicRanges::reduce(alignedReadsFWDlist)
+
+  beforePAD = IRanges::start(alignedReadsFWDlist[seqnames(alignedReadsFWDlist) == exp$insertName]) < IRanges::start(ITRpadRange)
+  S4Vectors::mcols(alignedReadsFWDlist)$beforePad = beforePAD
+  alignedReadsFWD <- IRanges::stack(alignedReadsFWDlist, "readName")
+
+  alignedReadsFWD$beforePad =  vapply(alignedReadsFWD$beforePad , any, FUN.VALUE = logical(1) )
+
+  ### REV
+
+
+  BiocGenerics::strand(alignedReadsREV) = "*"
+  alignedReadsREVlist =  GenomicRanges::split(alignedReadsREV, ~ readName)
+  alignedReadsREVlist = GenomicRanges::reduce(alignedReadsREVlist)
+
+  beforePAD = IRanges::start(alignedReadsREVlist[seqnames(alignedReadsREVlist) == exp$insertName]) < IRanges::start(ITRpadRange)
+  S4Vectors::mcols(alignedReadsREVlist)$beforePad = beforePAD
+  alignedReadsREV <- IRanges::stack(alignedReadsREVlist, "readName")
+
+  alignedReadsREV$beforePad =  vapply(alignedReadsREV$beforePad , any, FUN.VALUE = logical(1) )
+
+
+
 
   ##################################################################### assigner
+  exp$alignedReadsFWD = alignedReadsFWD
+  exp$alignedReadsREV = alignedReadsREV
+  exp$insertionMid = cassMid
+
   tmp = exp
   # get arguments
   name <- sapply(match.call(expand.dots=TRUE)[-1], deparse)
 
@@ -0,0 +1,142 @@
+
+
+#' checkPrimer
+#'
+#' This tool checks the assumptions about the primers.
+#'
+#' @author Robin H. van der Weide, \email{[email protected]}
+#' @param fwdPrimer A character-string of the forward-primer used.
+#' @param revPrimer A character-string of the reverse-primer used.
+#' @param exp The tagMeppr-object of a sample: first run \code{\link{align}}.
+#' @param ITR Can take PiggyBac (default), SleepingBeauty, or a path to a 1000xN-padded ITR.fasta.
+#' @details
+#'
+#'
+#' The expected general layout for the ITR-sequence looks like this:
+#'
+#' \code{|---ITR---NNN...NNN---ITR---|}
+#'
+#' The primers are expected to be 5'-end for the reverse and 3' for the forward:
+#'
+#' \code{<rev}
+#' \code{|---ITR---NNN...NNN---ITR---|}
+#' \code{                         fwd>}
+#'
+#' This tool checks these assumptions and sets the rev5_fwd3 flag to TRUE if the
+#' reverse primer lies before the forward primer, and to FALSE if it does not.
+#'
+#' @examples
+#' \dontrun{
+#'
+#' C9 = newTagMeppr(F1 = 'clone9_FWD_R1.fq.gz',
+#'                  F2 = 'clone9_FWD_R2.fq.gz',
+#'                  R1 = 'clone9_REV_R1.fq.gz',
+#'                  R2 = 'clone9_REV_R2.fq.gz',
+#'                  name = "clone9",
+#'                  protocol = 'PiggyBac')
+#'
+#' checkPrimer(fwdPrimer = "CGTCAATTTTACGCAGACTATC",
+#'             revPrimer = "GTACGTCACAATATGATTATCTTTCTAG",
+#'             exp =  C9,
+#'             ITR = 'PiggyBac')
+#'
+#' }
+#' @return The experiment-object will be updated with the rev5_fwd3-flag, which
+#' will tell all downstream analyses if our assumptions are correct. The flag
+#' itself is returned invisibly.
+#'
+#' @importFrom Biostrings reverseComplement DNAString readDNAStringSet letterFrequency vmatchPattern
+#' @export
+checkPrimer <- function(fwdPrimer, revPrimer, exp, ITR = 'PiggyBac'){
+  if(exp$protocol != ITR){
+    stop('Protocol given in exp (', exp$protocol,
+         ') is not the same as given as ITR (',ITR,').')
+  }
+
+  ############################################################# get revComplement
+  fwdPrimerCompl = Biostrings::reverseComplement(Biostrings::DNAString(fwdPrimer))
+  revPrimerCompl = Biostrings::reverseComplement(Biostrings::DNAString(revPrimer))
+
+  ##################################################################### load ITR
+  transposonSeq = NULL
+  if(ITR == "PiggyBac"){
+    transposonSeq = tagMeppr::PiggyBacITRs
+  } else if(ITR == "SleepingBeauty"){
+    transposonSeq = tagMeppr::SleepingBeautyITRs
+  } else if(grepl(ITR, pattern = ".fa", fixed = TRUE)){
+    # fixed = TRUE: the dot has to be a literal dot, not a regex wildcard
+    if(!file.exists(ITR)){
+      stop('The file ', ITR, ' does not exist.')
+    }
+    transposonSeq = Biostrings::readDNAStringSet(filepath = ITR, use.names = TRUE)
+    # check if N-padded: all() keeps the condition a single TRUE/FALSE when the
+    # file holds several records; an empty file cannot be padded either
+    N1k = Biostrings::letterFrequency(transposonSeq, letters = "N") == 1000
+    if(length(N1k) == 0 || !all(N1k)){
+      stop('The file ', ITR, " has no padding of 1000 N's between the arms.")
+    }
+  } else {
+    stop('Please set ITR to either "PiggyBac", "SleepingBeauty", or as a path to a .fasta-file!')
+  }
+
+  ##################################################################### get arms
+  NpadRange = Biostrings::vmatchPattern(transposonSeq,
+                                        pattern = paste0(rep('N', 1e3), collapse = ''))
+
+  ######################################################################### find
+  # try the primers as given first, and their reverse complements second
+  hitF = Biostrings::vmatchPattern(transposonSeq,pattern = fwdPrimer)[1]
+  hitR = Biostrings::vmatchPattern(transposonSeq,pattern = revPrimer)[1]
+
+  if(length(hitF[[1]]) == 0){
+    hitF = Biostrings::vmatchPattern(transposonSeq,pattern = fwdPrimerCompl)[1]
+  }
+  if(length(hitR[[1]]) == 0){
+    hitR = Biostrings::vmatchPattern(transposonSeq,pattern = revPrimerCompl)[1]
+  }
+
+  ############################################################### check if found
+  if(length(hitF[[1]]) == 0){
+    stop('No match between fwdPrimer (including revComplement) and the sequence.')
+  }
+  if(length(hitR[[1]]) == 0){
+    stop('No match between revPrimer (including revComplement) and the sequence.')
+  }
+
+  ################################################################ multiple hits
+  # a primer matching more than once makes the conditions below longer than one,
+  # which is an error since R 4.2.0: fall back on the first match
+  posF = hitF[[1]]
+  posR = hitR[[1]]
+  if(length(posF) > 1 || length(posR) > 1){
+    warning('A primer matches the sequence more than once: using the first match.')
+    posF = posF[1]
+    posR = posR[1]
+  }
+
+  ############################################# check if they are on unique arms
+  belowF = unlist(posF < NpadRange)
+  belowR = unlist(posR < NpadRange)
+
+  if(belowR == belowF){
+    if(belowF){
+      stop('Primers are both found on the first ITR!')
+    } else {
+      stop('Primers are both found on the second ITR!')
+    }
+  }
+
+  ####################### check if start(reverse primer) < start(forward primer)
+  # FALSE means the reverse primer is on the second ITR, which is not expected
+  rev5_fwd3 = if(posR < posF) TRUE else FALSE
+  exp$rev5_fwd3 = rev5_fwd3
+
+  ##################################################################### assigner
+  # overwrite the object that was passed as exp in the calling environment
+  tmp = exp
+  name <- sapply(match.call(expand.dots=TRUE)[-1], deparse)
+  # find argument position for exp
+  AP = which(names(name) == 'exp')
+  assign(name[AP], tmp, envir = parent.frame())
+  invisible(rev5_fwd3)
+}