Skip to content

Commit 3e74f53

Browse files
author
Robin Van der Weide
committed
eod
1 parent d201ae5 commit 3e74f53

28 files changed

+907
-272
lines changed

DESCRIPTION

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
Package: tagMeppr
22
Type: Package
3-
Title: A computational pipeline to map tagmap-insertions.
3+
Title: A computational pipeline to map tagmap-insertions
44
Version: 0.1.0
55
Authors@R: person("Robin H.", "van der Weide", email = "[email protected]", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-6466-7280s"))
66
Author: Robin H. van der Weide [aut, cre]
77
Maintainer: Robin H. van der Weide <[email protected]>
8-
Description: Align, interrogate and visualise your tagmap sequencing data, without leaving R.
8+
Description: Align, interrogate and visualise your tagmap sequencing data, without leaving R
99
License: MIT + file LICENSE
1010
Depends: R (>= 3.4.0)
1111
URL: https://github.com/robinweide/tagmeppr

NAMESPACE

+11
Original file line numberDiff line numberDiff line change
@@ -2,31 +2,41 @@
22

33
S3method(print,tagMepprIndex)
44
S3method(print,tagMepprSample)
5+
S3method(results,tagMepprSample)
56
export(align)
7+
export(checkPrimer)
68
export(findInsertions)
79
export(loadIndex)
810
export(makeIndex)
911
export(newTagMeppr)
1012
export(plotInsertions)
1113
export(plotSite)
14+
export(results)
1215
export(runIDgen)
16+
export(tagMepprCol)
1317
importFrom(BSgenome,getSeq)
1418
importFrom(BSgenome.Hsapiens.UCSC.hg19,BSgenome.Hsapiens.UCSC.hg19)
1519
importFrom(BiocGenerics,end)
1620
importFrom(BiocGenerics,start)
1721
importFrom(BiocGenerics,strand)
22+
importFrom(Biostrings,DNAString)
23+
importFrom(Biostrings,letterFrequency)
1824
importFrom(Biostrings,readDNAStringSet)
25+
importFrom(Biostrings,reverseComplement)
1926
importFrom(Biostrings,vmatchPattern)
2027
importFrom(Biostrings,writeXStringSet)
2128
importFrom(GenomeInfoDb,seqlengths)
2229
importFrom(GenomicAlignments,readGAlignments)
2330
importFrom(GenomicAlignments,width)
2431
importFrom(GenomicRanges,GRanges)
2532
importFrom(GenomicRanges,GRangesList)
33+
importFrom(GenomicRanges,as.data.frame)
2634
importFrom(GenomicRanges,countOverlaps)
2735
importFrom(GenomicRanges,findOverlaps)
2836
importFrom(GenomicRanges,makeGRangesFromDataFrame)
37+
importFrom(GenomicRanges,reduce)
2938
importFrom(GenomicRanges,seqnames)
39+
importFrom(GenomicRanges,strand)
3040
importFrom(IRanges,IRanges)
3141
importFrom(IRanges,coverage)
3242
importFrom(IRanges,subsetByOverlaps)
@@ -71,6 +81,7 @@ importFrom(reshape2,colsplit)
7181
importFrom(rtracklayer,export.bed)
7282
importFrom(rtracklayer,import.bed)
7383
importFrom(scales,extended_breaks)
84+
importFrom(scales,hue_pal)
7485
importFrom(stats,approx)
7586
importFrom(stats,complete.cases)
7687
importFrom(stats,p.adjust)

NEWS.md

+15-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,18 @@
1+
# tagmeppr 0.2
2+
3+
* results-method to make a df of results
4+
* updated results-metadata to be more clear (orientation is now strand info)
5+
* `plotInsertions` now handles failed experiments without insertions
6+
* counts are now correct: `align()` did show some duplicate reads (which were not remove due to strand-info)
7+
* added the option to use padding in `findInsertions()`, which fixes issues with the mapper reporting matches over the TIS.
8+
* wrote checkPrimer to... check... the... primers...
9+
* `findInsertions()` now uses D_scores to set orientation as strand
10+
* `findInsertions()` uses the 5rev_3fwd-flag for setting the orientation
11+
* `plotInsertions()` has better spacing, colours and handles orientation.
12+
* `results()` now enables filtering on pvalue, counts and orientation
13+
* `plotSite()` is ready to use and also shows the orientation
14+
* a bug `plotSite()` regarding x-ticks is fixed
15+
116
# tagmeppr 0.1
217

318
* re-wrote indexing part to use internal ITRs
@@ -9,4 +24,3 @@
924
* put postalign inside align
1025
* made a minimised fastq-seq to speed things up
1126
* made nice print methods
12-
* results-method to make a df of results

R/align.R

+44-14
Original file line numberDiff line numberDiff line change
@@ -302,28 +302,58 @@ align = function(exp, ref, cores = 20, empericalCentre = F, verbose = F){
302302

303303
combined = lapply(c('FWD', 'REV'), function(i){
304304
combined = list(mainCorpus[[i]],SAlist[[i]],XAlist[[i]])
305-
combined <- suppressWarnings(dplyr::bind_rows(combined))
306-
combined = unique(combined)
307-
308-
# split combined on the mapping-position in the casette
309-
tmpCassette = combined[combined$seqnames == exp$insertName, ]
310-
orDF = data.frame(readName = tmpCassette$readName,
311-
orientation = ifelse( (tmpCassette$start + 2) < cassMid, 5,3))
312-
combined = suppressWarnings(dplyr::full_join(combined, orDF, by = c("readName")))
313305

314-
# remove read with no cassette-loc
315-
combined = combined[!is.na(combined$orientation), ]
306+
combined <- suppressWarnings(dplyr::bind_rows(combined))
316307

317-
unique(combined)
318308
})
319309

320-
exp$alignedReadsFWD = GenomicRanges::makeGRangesFromDataFrame( combined[[1]],
310+
311+
alignedReadsFWD = GenomicRanges::makeGRangesFromDataFrame( combined[[1]],
321312
keep.extra.columns = T)
322-
exp$alignedReadsREV = GenomicRanges::makeGRangesFromDataFrame( combined[[2]],
313+
alignedReadsREV = GenomicRanges::makeGRangesFromDataFrame( combined[[2]],
323314
keep.extra.columns = T)
324-
exp$insertionMid = cassMid
315+
316+
317+
318+
319+
#################################################################### ITRs
320+
# get a GRanges ofthe two arms: this biostrings should be in the ref-object
321+
ITRpadRange = ref$NpadRange
322+
323+
#################################################################### prettyBam
324+
### FWD
325+
326+
BiocGenerics::strand(alignedReadsFWD) = "*"
327+
alignedReadsFWDlist = GenomicRanges::split(alignedReadsFWD, ~ readName)
328+
alignedReadsFWDlist = GenomicRanges::reduce(alignedReadsFWDlist)
329+
330+
beforePAD = IRanges::start(alignedReadsFWDlist[seqnames(alignedReadsFWDlist) == exp$insertName]) < IRanges::start(ITRpadRange)
331+
S4Vectors::mcols(alignedReadsFWDlist)$beforePad = beforePAD
332+
alignedReadsFWD <- IRanges::stack(alignedReadsFWDlist, "readName")
333+
334+
alignedReadsFWD$beforePad = vapply(alignedReadsFWD$beforePad , any, FUN.VALUE = logical(1) )
335+
336+
### REV
337+
338+
339+
BiocGenerics::strand(alignedReadsREV) = "*"
340+
alignedReadsREVlist = GenomicRanges::split(alignedReadsREV, ~ readName)
341+
alignedReadsREVlist = GenomicRanges::reduce(alignedReadsREVlist)
342+
343+
beforePAD = IRanges::start(alignedReadsREVlist[seqnames(alignedReadsREVlist) == exp$insertName]) < IRanges::start(ITRpadRange)
344+
S4Vectors::mcols(alignedReadsREVlist)$beforePad = beforePAD
345+
alignedReadsREV <- IRanges::stack(alignedReadsREVlist, "readName")
346+
347+
alignedReadsREV$beforePad = vapply(alignedReadsREV$beforePad , any, FUN.VALUE = logical(1) )
348+
349+
350+
325351

326352
##################################################################### assigner
353+
exp$alignedReadsFWD = alignedReadsFWD
354+
exp$alignedReadsREV = alignedReadsREV
355+
exp$insertionMid = cassMid
356+
327357
tmp = exp
328358
# get arguments
329359
name <- sapply(match.call(expand.dots=TRUE)[-1], deparse)

R/checkPrimer.R

+142
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
2+
3+
#' checkPrimer
4+
#'
5+
#' This tool checks the assumptions about the primers.
6+
#'
7+
#' @author Robin H. van der Weide, \email{[email protected]}
8+
#' @param fwdPrimer A character-string of the forward-primer used.
9+
#' @param revPrimer A character-string of the reverse-primer used.
10+
#' @param exp The tagMeppr-object of a sample: first run \code{\link{align}}.
11+
#' @param ITR Can take PiggyBac (default), SleepingBeauty, or a path to a 1000xN-padded ITR.fasta.
12+
#' @details
13+
#'
14+
#'
15+
#' The expected general layout for the ITR-sequence looks like this:
16+
#'
17+
#' \code{|---ITR---NNN...NNN---ITR---|}
18+
#'
19+
#' The primers are expected to be 5'-end for the reverse and 3' for the forward:
20+
#'
21+
#' \code{<rev}
22+
#' \code{|---ITR---NNN...NNN---ITR---|}
23+
#' \code{ fwd>}
24+
#'
25+
#' This tool checks these assumptions and sets the rev5_fwd3 flag to TRUE.
26+
#'
27+
#' @examples
28+
#' \dontrun{
29+
#'
30+
#' C9 = newTagMeppr(F1 = 'clone9_FWD_R1.fq.gz',
31+
#' F2 = 'clone9_FWD_R2.fq.gz',
32+
#' R1 = 'clone9_REV_R1.fq.gz',
33+
#' R2 = 'clone9_REV_R2.fq.gz',
34+
#' name = "clone9",
35+
#' protocol = 'PiggyBac')
36+
#'
37+
#' checkPrimer(fwdPrimer = "CGTCAATTTTACGCAGACTATC",
38+
#' revPrimer = "GTACGTCACAATATGATTATCTTTCTAG",
39+
#' exp = C9,
40+
#' ITR = 'PiggyBac')
41+
#'
42+
#' }
43+
#' @return The experiment-object will be updated with the rev5_fwd3-flag, which
44+
#' will tell all downstream analyses if our assumptions are correct.
45+
#'
46+
#' @importFrom Biostrings reverseComplement DNAString readDNAStringSet letterFrequency vmatchPattern
47+
#' @export
48+
checkPrimer <- function(fwdPrimer, revPrimer, exp, ITR = 'PiggyBac'){
49+
rev5_fwd3 = F
50+
51+
if(exp$protocol != ITR){
52+
stop('Protocol given in exp (', exp$protocol,
53+
') is not the same as given as ITR (',ITR,').')
54+
}
55+
56+
############################################################# get revComplement
57+
fwdPrimerCompl = Biostrings::reverseComplement(Biostrings::DNAString(fwdPrimer))
58+
revPrimerCompl = Biostrings::reverseComplement(Biostrings::DNAString(revPrimer))
59+
60+
##################################################################### load ITR
61+
transposonSeq = NULL
62+
if(ITR == "PiggyBac"){
63+
transposonSeq = tagMeppr::PiggyBacITRs
64+
} else if(ITR == "SleepingBeauty"){
65+
transposonSeq = tagMeppr::SleepingBeautyITRs
66+
} else if(grepl(ITR, pattern = ".fa")){
67+
# check if exists
68+
if(file.exists(ITR)){
69+
transposonSeq = Biostrings::readDNAStringSet(filepath = ITR, use.names = T)
70+
# check if N-padded
71+
N1k = Biostrings::letterFrequency(transposonSeq, letters = "N") == 1000
72+
if(!N1k){
73+
stop('The file ', ITR, " has no padding of 1000 N's between the arms.")
74+
}
75+
} else {
76+
stop('The file ', ITR, ' does not exist.')
77+
}
78+
} else {
79+
stop('Please set ITR to either "PiggyBac", "SleepingBeauty", or as a path to a .fasta-file!')
80+
}
81+
82+
##################################################################### get arms
83+
NpadRange = Biostrings::vmatchPattern(transposonSeq,
84+
pattern = paste0(rep('N', 1e3), collapse = ''))
85+
86+
######################################################################### find
87+
88+
hitF = Biostrings::vmatchPattern(transposonSeq,pattern = fwdPrimer)[1]
89+
hitR = Biostrings::vmatchPattern(transposonSeq,pattern = revPrimer)[1]
90+
91+
if(length(hitF[[1]]) == 0){
92+
hitF = Biostrings::vmatchPattern(transposonSeq,pattern = fwdPrimerCompl)[1]
93+
}
94+
95+
if(length(hitR[[1]]) == 0){
96+
hitR = Biostrings::vmatchPattern(transposonSeq,pattern = revPrimerCompl)[1]
97+
}
98+
99+
100+
############################################################### check if found
101+
if(length(hitF[[1]]) == 0){
102+
stop('No match between fwdPrimer (including revComplement) and the sequence.')
103+
}
104+
105+
if(length(hitR[[1]]) == 0){
106+
stop('No match between revPrimer (including revComplement) and the sequence.')
107+
}
108+
109+
############################################# check if they are on unique arms
110+
belowF = unlist(hitF[[1]] < NpadRange)
111+
belowR = unlist(hitR[[1]] < NpadRange)
112+
113+
if(belowR == belowF){
114+
if(belowF){
115+
stop('Primers are both found on the first ITR!')
116+
} else {
117+
stop('Primers are both found on the second ITR!')
118+
}
119+
}
120+
121+
####################### check if start(reverse primer) < start(forward primer)
122+
if(hitR[[1]] < hitF[[1]]){
123+
rev5_fwd3 = T
124+
} else {
125+
# reverse is on second ITR, which is not what I expect
126+
rev5_fwd3 = F
127+
}
128+
129+
exp$rev5_fwd3 = rev5_fwd3
130+
131+
##################################################################### assigner
132+
tmp = exp
133+
# get arguments
134+
name <- sapply(match.call(expand.dots=TRUE)[-1], deparse)
135+
#find argument postion for exp
136+
AP = which(names(name) == 'exp')
137+
138+
assign(name[AP], tmp, envir = parent.frame())
139+
140+
invisible(rev5_fwd3)
141+
142+
}

0 commit comments

Comments
 (0)