Skip to content

Commit

Permalink
Bug fix for tiny edge-case in MarkDuplicates
Browse files Browse the repository at this point in the history
* In the extremely improbable case that the mated reads share the same unclipped 5' end, and are on opposite strands, the order in the file will determine the orientation ("innie" vs. "outtie") and thus if the relative order of the mates is not deterministic, two such pairs (with the same position) may or may not be marked as duplicates. While this is virtually impossible in coordinate-sorted input (since you need all but one base to be clipped in the back-facing read) in query-ordered input this has happened in the wild.

- This PR corrects this behavior by declaring such mates to be "innies" regardless of the order in the file.
- Tests covering both cases (queryname and coordinate sorted) have been added
- Variable names changed to make code more readable
- A small saving was found which reduced the number of times the readnames are parsed by 1/2.
  • Loading branch information
Yossi Farjoun committed May 26, 2016
1 parent 04ed147 commit 85823d2
Show file tree
Hide file tree
Showing 14 changed files with 300 additions and 27 deletions.
32 changes: 22 additions & 10 deletions src/java/picard/sam/markduplicates/MarkDuplicates.java
Original file line number Diff line number Diff line change
Expand Up @@ -443,11 +443,13 @@ private void buildSortedReadEndLists(final boolean useBarcodes) {

// See if we've already seen the first end or not
if (pairedEnds == null) {
pairedEnds = buildReadEnds(header, indexForRead, rec, useBarcodes);
// at this point pairedEnds and fragmentEnd are the same, but we need to make
// a copy since pairedEnds will be modified when the mate comes along.
pairedEnds = fragmentEnd.clone();
tmp.put(pairedEnds.read2ReferenceIndex, key, pairedEnds);
} else {
final int sequence = fragmentEnd.read1ReferenceIndex;
final int coordinate = fragmentEnd.read1Coordinate;
final int matesRefIndex = fragmentEnd.read1ReferenceIndex;
final int matesCoordinate = fragmentEnd.read1Coordinate;

// Set orientationForOpticalDuplicates, which always goes by the first then the second end for the strands. NB: must do this
// before updating the orientation later.
Expand All @@ -461,20 +463,30 @@ private void buildSortedReadEndLists(final boolean useBarcodes) {
((ReadEndsForMarkDuplicatesWithBarcodes) pairedEnds).readTwoBarcode = getReadTwoBarcodeValue(rec);
}

// If the second read is actually later, just add the second read data, else flip the reads
if (sequence > pairedEnds.read1ReferenceIndex ||
(sequence == pairedEnds.read1ReferenceIndex && coordinate >= pairedEnds.read1Coordinate)) {
pairedEnds.read2ReferenceIndex = sequence;
pairedEnds.read2Coordinate = coordinate;
// If the other read is actually later, simply add the other read's data as read2, else flip the reads
if (matesRefIndex > pairedEnds.read1ReferenceIndex ||
(matesRefIndex == pairedEnds.read1ReferenceIndex && matesCoordinate >= pairedEnds.read1Coordinate)) {
pairedEnds.read2ReferenceIndex = matesRefIndex;
pairedEnds.read2Coordinate = matesCoordinate;
pairedEnds.read2IndexInFile = indexForRead;
pairedEnds.orientation = ReadEnds.getOrientationByte(pairedEnds.orientation == ReadEnds.R,
rec.getReadNegativeStrandFlag());

// if the two read ends are in the same position, pointing in opposite directions,
// the orientation is undefined and the procedure above
// will depend on the order of the reads in the file.
// To avoid this, we set it explicitly (to FR):
if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
pairedEnds.orientation == ReadEnds.RF) {
pairedEnds.orientation = ReadEnds.FR;
}
} else {
pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
pairedEnds.read1ReferenceIndex = sequence;
pairedEnds.read1Coordinate = coordinate;
pairedEnds.read1ReferenceIndex = matesRefIndex;
pairedEnds.read1Coordinate = matesCoordinate;
pairedEnds.read1IndexInFile = indexForRead;
pairedEnds.orientation = ReadEnds.getOrientationByte(rec.getReadNegativeStrandFlag(),
pairedEnds.orientation == ReadEnds.R);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
*
* @author Nils Homer
*/
public class ReadEndsForMarkDuplicates extends ReadEnds {
public class ReadEndsForMarkDuplicates extends ReadEnds implements Cloneable {
/*
What do we need to store you ask? Well, we need to store:
- byte: orientation
Expand Down Expand Up @@ -71,4 +71,9 @@ public ReadEndsForMarkDuplicates(final ReadEndsForMarkDuplicates read) {
this.read1IndexInFile = read.read1IndexInFile;
this.read2IndexInFile = read.read2IndexInFile;
}

@Override
public ReadEndsForMarkDuplicates clone() {
return new ReadEndsForMarkDuplicates(this);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,18 @@ public ReadEndsForMarkDuplicatesWithBarcodes(final ReadEndsForMarkDuplicates rea
super(read);
}

public ReadEndsForMarkDuplicatesWithBarcodes(final ReadEndsForMarkDuplicatesWithBarcodes read) {
super(read);
barcode = read.barcode;
readOneBarcode = read.readOneBarcode;
readTwoBarcode = read.readTwoBarcode;
}

@Override
public ReadEndsForMarkDuplicatesWithBarcodes clone() {
return new ReadEndsForMarkDuplicatesWithBarcodes(this);
}

public static int getSizeOf() {
return ReadEndsForMarkDuplicates.getSizeOf() + (3 * 4);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -598,4 +598,5 @@ public void testTwoMappedPairsWithSupplementaryReadsAfterCanonical(final Boolean
tester.addMappedFragment(fragLikeFirst ? "RUNID:1:1:15993:13361" : "RUNID:1:1:16020:13352", 1, 400, markSecondaryAndSupplementaryRecordsLikeTheCanonical() && !fragLikeFirst, null, null, additionalFragSecondary, additionalFragSupplementary, DEFAULT_BASE_QUALITY);
tester.runTest();
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,11 @@ abstract public class AbstractMarkDuplicatesCommandLineProgramTester extends Sam
final DuplicationMetrics expectedMetrics;

public AbstractMarkDuplicatesCommandLineProgramTester(final ScoringStrategy duplicateScoringStrategy, SAMFileHeader.SortOrder sortOrder) {
super(50, true, SAMRecordSetBuilder.DEFAULT_CHROMOSOME_LENGTH, duplicateScoringStrategy, sortOrder);
this(duplicateScoringStrategy, sortOrder, true);
}

public AbstractMarkDuplicatesCommandLineProgramTester(final ScoringStrategy duplicateScoringStrategy, SAMFileHeader.SortOrder sortOrder, boolean recordNeedSorting) {
super(50, true, SAMRecordSetBuilder.DEFAULT_CHROMOSOME_LENGTH, duplicateScoringStrategy, sortOrder, recordNeedSorting);

expectedMetrics = new DuplicationMetrics();
expectedMetrics.READ_PAIR_OPTICAL_DUPLICATES = 0;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
* The MIT License
*
* Copyright (c) 2016 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam.markduplicates;

import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.CloserUtil;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.File;

/**
* Tests a few hand build sam files as they are.
*/
public class AsIsMarkDuplicatesTester {

@DataProvider
public Object[][] testSameUnclipped5PrimeOppositeStrandData() {
final File TEST_DIR = new File("testdata/picard/sam/MarkDuplicates");
return new Object[][]{
new Object[]{new File(TEST_DIR, "sameUnclipped5primeEndv1.sam")},
new Object[]{new File(TEST_DIR, "sameUnclipped5primeEndv2.sam")},
new Object[]{new File(TEST_DIR, "sameUnclipped5primeEndCoordinateSortedv1.sam")},
new Object[]{new File(TEST_DIR, "sameUnclipped5primeEndCoordinateSortedv2.sam")},
new Object[]{new File(TEST_DIR, "sameUnclipped5primeEndCoordinateSortedv3.sam")},
new Object[]{new File(TEST_DIR, "sameUnclipped5primeEndCoordinateSortedv4.sam")}
};
}

@Test(dataProvider = "testSameUnclipped5PrimeOppositeStrandData")
public void testSameUnclipped5PrimeOppositeStrand(final File input) {

final AbstractMarkDuplicatesCommandLineProgramTester tester = new BySumOfBaseQAndInOriginalOrderMDTester();

final SamReader reader = SamReaderFactory.makeDefault().open(input);

tester.setHeader(reader.getFileHeader());
reader.iterator().stream().forEach(tester::addRecord);

CloserUtil.close(reader);
tester.setExpectedOpticalDuplicate(0);
tester.runTest();
}
}


Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
* The MIT License
*
* Copyright (c) 2016 The Broad Institute
* Copyright (c) 2014 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand All @@ -24,22 +24,21 @@

package picard.sam.markduplicates;

import htsjdk.samtools.DuplicateScoringStrategy;
import htsjdk.samtools.SAMFileHeader;
import picard.cmdline.CommandLineProgram;

/**
* This is the testing class for the QuerySortedMarkDuplicates (it's the same CLP, only it
* takes a query-sorted bam as input.) The difference in the tests themselves is that
* supplementary and secondary reads will be marked the same as their primary alignments,
* and that unmapped reads with mapped mates will be marked according to their mapped mate.
* Created by farjoun on 5/25/16.
*/
public class QuerySortedMarkDuplicatesTest extends MarkDuplicatesTest {
public class BySumOfBaseQAndInOriginalOrderMDTester extends AbstractMarkDuplicatesCommandLineProgramTester {

@Override
protected boolean markUnmappedRecordsLikeTheirMates() {
return true;
public BySumOfBaseQAndInOriginalOrderMDTester() {
super(DuplicateScoringStrategy.ScoringStrategy.SUM_OF_BASE_QUALITIES, SAMFileHeader.SortOrder.unsorted, false);
}

@Override
protected boolean markSecondaryAndSupplementaryRecordsLikeTheCanonical() { return true; }

@Override
protected AbstractMarkDuplicatesCommandLineProgramTester getTester() { return new QuerySortedMarkDuplicatesTester(); }
protected CommandLineProgram getProgram() {
return new MarkDuplicates();
}
}
8 changes: 6 additions & 2 deletions src/tests/java/picard/sam/testers/SamFileTester.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,16 @@ public abstract class SamFileTester extends CommandLineProgramTest {
private boolean deleteOnExit = true;
private final ArrayList<String> args = new ArrayList<>();

public SamFileTester(final int readLength, final boolean deleteOnExit, final int defaultChromosomeLength, final ScoringStrategy duplicateScoringStrategy, final SAMFileHeader.SortOrder sortOrder) {
public SamFileTester(final int readLength, final boolean deleteOnExit, final int defaultChromosomeLength, final ScoringStrategy duplicateScoringStrategy, final SAMFileHeader.SortOrder sortOrder, boolean recordsNeedSorting) {
this.deleteOnExit = deleteOnExit;
this.samRecordSetBuilder = new SAMRecordSetBuilder(true, sortOrder, true, defaultChromosomeLength, duplicateScoringStrategy);
this.samRecordSetBuilder = new SAMRecordSetBuilder(recordsNeedSorting, sortOrder, true, defaultChromosomeLength, duplicateScoringStrategy);
samRecordSetBuilder.setReadLength(readLength);
setOutputDir();
}

public SamFileTester(final int readLength, final boolean deleteOnExit, final int defaultChromosomeLength, final ScoringStrategy duplicateScoringStrategy, final SAMFileHeader.SortOrder sortOrder) {
this(readLength, deleteOnExit, defaultChromosomeLength, duplicateScoringStrategy, SAMFileHeader.SortOrder.coordinate, true);
}

public SamFileTester(final int readLength, final boolean deleteOnExit, final int defaultChromosomeLength, final ScoringStrategy duplicateScoringStrategy) {
this(readLength, deleteOnExit, defaultChromosomeLength, duplicateScoringStrategy, SAMFileHeader.SortOrder.coordinate);
Expand Down Expand Up @@ -340,4 +343,5 @@ private File createInputFile() {
public SamReader getInput() {
return samRecordSetBuilder.getSamReader();
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
@HD VN:1.5 SO:coordinate
@SQ SN:1 LN:197195432 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:f05d753079c455c0e57af88eeda24493 SP:Mus musculus
@SQ SN:2 LN:181748087 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:9b9d64dc89ecc73d3288eb38af3f94bd SP:Mus musculus
@SQ SN:3 LN:159599783 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:0a692666a1b8526e1d1e799beb71b6d0 SP:Mus musculus
@SQ SN:4 LN:155630120 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:f5993a04396a06ed6b28fa42b2429be0 SP:Mus musculus
@SQ SN:5 LN:152537259 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:f90804fb8fe9cb06076d51a710fb4563 SP:Mus musculus
@SQ SN:6 LN:149517037 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:258a37e20815bb7e3f2e974b9d4dd295 SP:Mus musculus
@SQ SN:7 LN:152524553 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:e0d6cea6f72cb4d9f8d0efc1d29dd180 SP:Mus musculus
@SQ SN:8 LN:131738871 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:5f217cb8a9685b9879add3ae110cabd7 SP:Mus musculus
@SQ SN:9 LN:124076172 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:dde08574942fc18050195618cc3f35af SP:Mus musculus
@SQ SN:10 LN:129993255 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:be7e6a13cc6b9da7c1da7b7fc32c5506 SP:Mus musculus
@SQ SN:11 LN:121843856 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:e0099550b3d3943fb9bb7af6fa6952c1 SP:Mus musculus
@SQ SN:12 LN:121257530 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:1f9c11dc6f288f93e9fab56772a36e85 SP:Mus musculus
@SQ SN:13 LN:120284312 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:a7b4bb418aa21e0ec59d9e2a1fe1810b SP:Mus musculus
@SQ SN:14 LN:125194864 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:09d1c8449706a17d40934302a0a3b671 SP:Mus musculus
@SQ SN:15 LN:103494974 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:e41c8b42b0921378b1fdd5172f6be067 SP:Mus musculus
@SQ SN:16 LN:98319150 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:e051b3930c2557ade21d67db41f3a518 SP:Mus musculus
@SQ SN:17 LN:95272651 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:47eede15e5761fb9c2267627f18211e7 SP:Mus musculus
@SQ SN:18 LN:90772031 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:9f9d41cfdb9d91b62b928a3eb4eb6928 SP:Mus musculus
@SQ SN:19 LN:61342430 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:591f8486f82c22442bb8463595a18e0a SP:Mus musculus
@SQ SN:X LN:166650296 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:3d0d9df898d2c830b858f91255d8a1eb SP:Mus musculus
@SQ SN:Y LN:15902555 UR:ftp://hgdownload.cse.ucsc.edu/goldenPath/Mus_musculus_assembly9/bigZips/chromFa.tar.gz AS:mm9 M5:5ff564f9fbc8cb87bcad6cfa6874902b SP:Mus musculus
@RG ID:00001.3 PL:illumina PU:00001ABXX101026.3.TTGAGCCT LB:Solexa-45924 DT:2010-10-26T00:00:00-0400 SM:stat2_120 CN:BI
@RG ID:00001.3 PL:illumina PU:00001ABXX101026.3.TTGAGCCT LB:Solexa-45924 DT:2010-10-26T00:00:00-0400 SM:stat2_120 CN:BI
ST-E00297:149016593:H3GVWCCXX:3:1218:20812:27591 145 8 43092875 0 150S1M = 43092875 49 AAAAAAAAAA BBBBBBBBBB MC:Z:151M RG:Z:00001.3
ST-E00297:149016593:H3GVWCCXX:3:1218:20812:27591 97 8 43092875 0 151M = 43092875 -49 AAAAAAAAAA BBBBBBBBBB MC:Z:150S1M RG:Z:00001.3
ST-E00297:149016593:H3GVWCCXX:5:2214:10145:57038 1169 8 43092875 0 150S1M = 43092875 44 AAAAAAAAAA AAAAAAAAAA MC:Z:151M RG:Z:00001.3
ST-E00297:149016593:H3GVWCCXX:5:2214:10145:57038 1121 8 43092875 0 151M = 43092875 -44 AAAAAAAAAA AAAAAAAAAA MC:Z:150S1M RG:Z:00001.3
Loading

0 comments on commit 85823d2

Please sign in to comment.