Make mergePairs work with one sample mergers #605

Closed
wants to merge 5 commits
Changes from all commits
127 changes: 68 additions & 59 deletions R/paired.R
@@ -94,8 +94,7 @@ mergePairs <- function(dadaF, derepF, dadaR, derepR, minOverlap = 12, maxMismatc
nrecs <- c(length(dadaF), length(derepF), length(dadaR), length(derepR))
if(length(unique(nrecs))>1) stop("The dadaF/derepF/dadaR/derepR arguments must be the same length.")

rval <- list()
for(i in seq_along(dadaF)) {
rval <- lapply(seq_along(dadaF), function (i) {
mapF <- derepF[[i]]$map
mapR <- derepR[[i]]$map
if(!(is.integer(mapF) && is.integer(mapR))) stop("Incorrect format of $map in derep-class arguments.")
@@ -110,69 +109,79 @@
pairdf <- data.frame(sequence = "", abundance=0, forward=rF, reverse=rR)
ups <- unique(pairdf) # The unique forward/reverse pairs of denoised sequences
keep <- !is.na(ups$forward) & !is.na(ups$reverse)
ups <- ups[!is.na(ups$forward) & !is.na(ups$reverse),] # Drop pairs with uncorrected uniques
Funqseq <- unname(as.character(dadaF[[i]]$clustering$sequence[ups$forward]))
Runqseq <- rc(unname(as.character(dadaR[[i]]$clustering$sequence[ups$reverse])))

if (justConcatenate == TRUE) {
# Simply concatenate the sequences together
ups$sequence <- mapply(function(x,y) paste0(x,"NNNNNNNNNN", y), Funqseq, Runqseq, SIMPLIFY=FALSE);
ups$nmatch <- 0
ups$nmismatch <- 0
ups$nindel <- 0
ups$prefer <- NA
ups$accept <- TRUE
ups <- ups[keep, ]
if (nrow(ups)==0) {
outnames <- c("sequence", "abundance", "forward", "reverse",
"nmatch", "nmismatch", "nindel", "prefer", "accept")
ups <- data.frame(matrix(ncol = length(outnames), nrow = 0))
names(ups) <- outnames
if(verbose) {
message("No paired-reads (in ZERO unique pairings) successfully merged out of ", nrow(pairdf), " pairings) input.")
}
return(ups)
} else {
# Align forward and reverse reads.
# Use unbanded N-W align to compare forward/reverse
# Adjusting align params to prioritize zero-mismatch merges
tmp <- getDadaOpt(c("MATCH", "MISMATCH", "GAP_PENALTY"))
if(maxMismatch==0) {
setDadaOpt(MATCH=1L, MISMATCH=-64L, GAP_PENALTY=-64L)
} else {
setDadaOpt(MATCH=1L, MISMATCH=-8L, GAP_PENALTY=-8L)
}
alvecs <- mapply(function(x,y) nwalign(x,y,band=-1,...), Funqseq, Runqseq, SIMPLIFY=FALSE)
setDadaOpt(tmp)
outs <- t(sapply(alvecs, function(x) C_eval_pair(x[1], x[2])))
ups$nmatch <- outs[,1]
ups$nmismatch <- outs[,2]
ups$nindel <- outs[,3]
ups$prefer <- 1 + (dadaR[[i]]$clustering$n0[ups$reverse] > dadaF[[i]]$clustering$n0[ups$forward])
ups$accept <- (ups$nmatch >= minOverlap) & ((ups$nmismatch + ups$nindel) <= maxMismatch)
# Make the sequence
ups$sequence <- mapply(C_pair_consensus, sapply(alvecs,`[`,1), sapply(alvecs,`[`,2), ups$prefer, trimOverhang);
# Additional param to indicate whether 1:forward or 2:reverse takes precedence
# Must also strip out any indels in the return
# This function is only used here.
Funqseq <- unname(as.character(dadaF[[i]]$clustering$sequence[ups$forward]))
Runqseq <- rc(unname(as.character(dadaR[[i]]$clustering$sequence[ups$reverse])))
if (justConcatenate == TRUE) {
# Simply concatenate the sequences together
ups$sequence <- mapply(function(x,y) paste0(x,"NNNNNNNNNN", y),
Funqseq, Runqseq, SIMPLIFY=FALSE);
ups$nmatch <- 0
ups$nmismatch <- 0
ups$nindel <- 0
ups$prefer <- NA
ups$accept <- TRUE
} else {
# Align forward and reverse reads.
# Use unbanded N-W align to compare forward/reverse
# Adjusting align params to prioritize zero-mismatch merges
tmp <- getDadaOpt(c("MATCH", "MISMATCH", "GAP_PENALTY"))
if(maxMismatch==0) {
setDadaOpt(MATCH=1L, MISMATCH=-64L, GAP_PENALTY=-64L)
} else {
setDadaOpt(MATCH=1L, MISMATCH=-8L, GAP_PENALTY=-8L)
}
alvecs <- mapply(function(x,y) nwalign(x,y,band=-1,...), Funqseq, Runqseq, SIMPLIFY=FALSE)
setDadaOpt(tmp)
outs <- t(sapply(alvecs, function(x) C_eval_pair(x[1], x[2])))
ups$nmatch <- outs[,1]
ups$nmismatch <- outs[,2]
ups$nindel <- outs[,3]
ups$prefer <- 1 + (dadaR[[i]]$clustering$n0[ups$reverse] > dadaF[[i]]$clustering$n0[ups$forward])
ups$accept <- (ups$nmatch >= minOverlap) & ((ups$nmismatch + ups$nindel) <= maxMismatch)
# Make the sequence
ups$sequence <- mapply(C_pair_consensus, sapply(alvecs,`[`,1), sapply(alvecs,`[`,2), ups$prefer, trimOverhang);
# Additional param to indicate whether 1:forward or 2:reverse takes precedence
# Must also strip out any indels in the return
# This function is only used here.
}

# Add abundance and sequence to the output data.frame
tab <- table(pairdf$forward, pairdf$reverse)
ups$abundance <- tab[cbind(ups$forward, ups$reverse)]
ups$sequence[!ups$accept] <- ""
# Add columns from forward/reverse clustering
propagateCol <- propagateCol[propagateCol %in% colnames(dadaF[[i]]$clustering)]
for(col in propagateCol) {
ups[,paste0("F.",col)] <- dadaF[[i]]$clustering[ups$forward,col]
ups[,paste0("R.",col)] <- dadaR[[i]]$clustering[ups$reverse,col]
}
# Sort output by abundance and name
ups <- ups[order(ups$abundance, decreasing=TRUE),]
rownames(ups) <- NULL
if(verbose) {
message(sum(ups$abundance[ups$accept]), " paired-reads (in ", sum(ups$accept), " unique pairings) successfully merged out of ", sum(ups$abundance), " (in ", nrow(ups), " pairings) input.")
}
if(!returnRejects) { ups <- ups[ups$accept,] }
# Add abundance and sequence to the output data.frame
tab <- table(pairdf$forward, pairdf$reverse)
ups$abundance <- tab[cbind(ups$forward, ups$reverse)]
ups$sequence[!ups$accept] <- ""
# Add columns from forward/reverse clustering
propagateCol <- propagateCol[propagateCol %in% colnames(dadaF[[i]]$clustering)]
for(col in propagateCol) {
ups[,paste0("F.",col)] <- dadaF[[i]]$clustering[ups$forward,col]
ups[,paste0("R.",col)] <- dadaR[[i]]$clustering[ups$reverse,col]
}
# Sort output by abundance and name
ups <- ups[order(ups$abundance, decreasing=TRUE),]
rownames(ups) <- NULL
if(verbose) {
message(sum(ups$abundance[ups$accept]), " paired-reads (in ", sum(ups$accept), " unique pairings) successfully merged out of ", sum(ups$abundance), " (in ", nrow(ups), " pairings) input.")
}
if(!returnRejects) { ups <- ups[ups$accept,] }

if(any(duplicated(ups$sequence))) {
message("Duplicate sequences in merged output.")
if(any(duplicated(ups$sequence))) {
message("Duplicate sequences in merged output.")
}
return(ups)
}
rval[[i]] <- ups
}
})
if(!is.null(names(dadaF))) names(rval) <- names(dadaF)
if(length(rval) == 1) rval <- rval[[1]]
else if(!is.null(names(dadaF))) names(rval) <- names(dadaF)

return(rval)
}

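For context, a minimal single-sample usage sketch of the behavior this patch targets. Only the return handling (`rval <- lapply(...)`, the per-sample `return(ups)`, and `if(length(rval) == 1) rval <- rval[[1]]`) comes from the diff above; the file names are placeholders and the surrounding workflow calls (derepFastq, learnErrors, dada) are the standard dada2 ones, so treat this as an illustration, not part of the change.

library(dada2)

# Hypothetical single-sample inputs (file names are placeholders).
drpF <- derepFastq("sample1_F_filt.fastq.gz")
drpR <- derepFastq("sample1_R_filt.fastq.gz")
errF <- learnErrors("sample1_F_filt.fastq.gz", multithread=TRUE)
errR <- learnErrors("sample1_R_filt.fastq.gz", multithread=TRUE)
ddF <- dada(drpF, err=errF)   # single dada-class object, not a list of them
ddR <- dada(drpR, err=errR)

# With this patch, a one-sample merge returns the merger data.frame directly
# rather than a length-one list of data.frames.
merged <- mergePairs(ddF, drpF, ddR, drpR, verbose=TRUE)
is.data.frame(merged)

Replacing the accumulating for loop with lapply() and an explicit per-sample return(ups) is also what allows the early return for the empty-ups case without duplicating the downstream abundance, propagateCol, and sorting bookkeeping.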