Skip to content

Commit

Permalink
r998: smart pairing; allow mixture of SE/PE reads
Browse files Browse the repository at this point in the history
  • Loading branch information
lh3 committed Nov 18, 2014
1 parent 6f9bf9a commit 80e4ecf
Show file tree
Hide file tree
Showing 7 changed files with 89 additions and 64 deletions.
38 changes: 14 additions & 24 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,29 +1,17 @@
Release 0.7.11 (XX September, 2014)
Release 0.7.11 (XX November, 2014)
-----------------------------------

A major change to BWA-MEM is the support of mapping to ALT contigs. To use this
feature, users need to manually create a file `indexbase.alt` with each line
giving the name of an ALT contig. During alignment, BWA-MEM will be able to
classify potential hits to ALT and non-ALT hits. It reports alignments and
assigns mapping quality (mapQ) loosely following these rules:
A major change to BWA-MEM is the support of mapping to ALT contigs in addition
to the primary assembly. Part of the ALT mapping strategy is implemented in
BWA-MEM and the rest in a postprocessing script for now. Due to the extra
layer of complexity on generating the reference genome and on the two-step
mapping, we start to provide a wrapper script and precompiled binaries since
this release. Please check README-alt.md for details.

1. The original mapQ of a non-ALT hit is computed across non-ALT hits only.
The reported mapQ of an ALT hit is always computed across all hits.

2. An ALT hit is only reported if its score is strictly better than all
overlapping non-ALT hits. A reported ALT hit is flagged with 0x800
(supplementary) unless there are no non-ALT hits.

3. The mapQ of a non-ALT hit is reduced to zero if its score is less than 80%
(controlled by option `-g`) of the score of an overlapping ALT hit. In this
case, the original mapQ is moved to the `om` tag.

This way, non-ALT alignments are only affected by ALT contigs if there are
significantly better ALT alignments. BWA-MEM is carefully engineered such that
ALT contigs do not interfere with the alignments to the primary assembly.

Users may consider to use ALT contigs from GRCh38. I am also constructing a
non-redundant and more complete set of sequences missing from GRCh38.
Another major addition to BWA-MEM is HLA typing, which made possible with the
new ALT mapping strategy. Necessary data and programs are also included in
the binary release. The wrapper script also performs HLA typing when HLA genes
are also included in the reference genome as additional ALT contigs.

Other notable changes to BWA-MEM:

Expand All @@ -42,7 +30,9 @@ Other notable changes to BWA-MEM:
* Added new pre-setting for Oxford Nanopore 2D reads. For small genomes,
though, LAST is still more sensitive.

(0.7.11: XX September 2014, rXXX)
* Added LAST-like seeding. This improves the accuracy for longer reads.

(0.7.11: XX November 2014, rXXX)



Expand Down
21 changes: 21 additions & 0 deletions bwa.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "ksw.h"
#include "utils.h"
#include "kstring.h"
#include "kvec.h"

#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
Expand Down Expand Up @@ -55,10 +56,12 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
}
trim_readno(&ks->name);
kseq2bseq1(ks, &seqs[n]);
seqs[n].id = n;
size += seqs[n++].l_seq;
if (ks2) {
trim_readno(&ks2->name);
kseq2bseq1(ks2, &seqs[n]);
seqs[n].id = n;
size += seqs[n++].l_seq;
}
if (size >= chunk_size && (n&1) == 0) break;
Expand All @@ -71,6 +74,24 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
return seqs;
}

void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2])
{
int i, has_last;
kvec_t(bseq1_t) a[2] = {{0,0,0}, {0,0,0}};
for (i = 1, has_last = 1; i < n; ++i) {
if (has_last) {
if (strcmp(seqs[i].name, seqs[i-1].name) == 0) {
kv_push(bseq1_t, a[1], seqs[i-1]);
kv_push(bseq1_t, a[1], seqs[i]);
has_last = 0;
} else kv_push(bseq1_t, a[0], seqs[i-1]);
} else has_last = 1;
}
if (has_last) kv_push(bseq1_t, a[0], seqs[i-1]);
sep[0] = a[0].a, m[0] = a[0].n;
sep[1] = a[1].a, m[1] = a[1].n;
}

/*****************
* CIGAR related *
*****************/
Expand Down
3 changes: 2 additions & 1 deletion bwa.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ typedef struct {
} bwaidx_t;

typedef struct {
int l_seq;
int l_seq, id;
char *name, *comment, *seq, *qual, *sam;
} bseq1_t;

Expand All @@ -35,6 +35,7 @@ extern "C" {
#endif

bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]);

void bwa_fill_scmat(int a, int b, int8_t mat[25]);
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
Expand Down
1 change: 1 addition & 0 deletions bwamem.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ typedef struct __smem_i smem_i;
#define MEM_F_ALN_REG 0x80
#define MEM_F_REF_HDR 0x100
#define MEM_F_SOFTCLIP 0x200
#define MEM_F_SMARTPE 0x400

typedef struct {
int a, b; // match score and mismatch penalty
Expand Down
55 changes: 26 additions & 29 deletions extras/run-bwamem
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use warnings;
use Getopt::Std;

my %opts = (t=>1, n=>64);
getopts("Spadsko:R:x:t:", \%opts);
getopts("Sadsko:R:x:t:", \%opts);

die('
Usage: run-bwamem [options] <idxbase> <file1> [file2]
Expand All @@ -17,30 +17,36 @@ Options: -o STR prefix for output files [inferred from
pacbio: pacbio subreads (~10kb query, high error rate)
ont2d: Oxford Nanopore reads (~10kb query, higher error rate)
-t INT number of threads [1]
-p input are paired-end reads if file2 absent
-a trim HiSeq2000/2500 PE resequencing adapters (via trimadap)
-d mark duplicate (via samblaster)
-S for SAM/BAM input, don\'t shuffle
-s sort the output alignment (higher RAM footprint)
-s sort the output alignment (requring more RAM)
-k keep temporary files generated by typeHLA
Note: File type is determined by the file extension of the first input sequence file.
Recognized file extensions: fasta, fa, fastq, fq, fasta.gz, fa.gz, fastq.gz,
fq.gz, sam, sam.gz and bam. SAM/BAM input will be converted to FASTQ.
Examples:
When option -d is in use, all the input reads are required to come from the same
library and all the reads from the library shall be included in the input. In
addition, samblaster does not distinguish optical and PCR duplicates.
* Map paired-end reads to GRCh38+ALT+decoy+HLA and perform HLA typing:
The output may consist of the following files:
run-bwamem -o prefix -t8 -R"@RG\tID:foo\tSM:bar" hs38d6.fa read1.fq.gz read2.fq.gz
{-o}.aln.sam.gz - unsorted alignment (unless -s is specified)
{-o}.aln.bam - sorted alignment (if -s is specified)
{-o}.oph.sam.gz - orphan single-end reads mapping (if input is paired SAM/BAM)
{-o}.hla.top - best HLA typing for the six classical HLA genes
{-o}.hla.all - additional HLA typing
{-o}.log.* - log files
* Remap coordinate-sorted BAM, trim Illumina PE adapters and sort the output. The BAM
may contain single-end or paired-end reads, or a mixture of the two types. The read
groups are not transferred to the output BAM.
run-bwamem -sao prefix hs38d6.fa old-srt.bam
* Remap name-grouped BAM and mark duplicates. Note that in this case, all reads from
a single library should be aligned at the same time. Paired-end only.
run-bwamem -Sdo prefix hs38d6.fa old-unsrt.bam
Output files:
{-o}.aln.bam - final alignment
{-o}.hla.top - best genotypes for the 6 classical HLA genes (if there are HLA-* contigs)
{-o}.hla.all - additional HLA genotypes consistent with data
{-o}.log.* - log files
') if @ARGV < 2;

Expand Down Expand Up @@ -103,19 +109,18 @@ if ($is_sam || $is_bam) {
my $cmd_sam2bam = $is_sam? "$root/htsbox samview -uS $ARGV[1] \\\n" : "cat $ARGV[1] \\\n";
my $ntmps = int($size / 4e9) + 1;
my $cmd_shuf = ($is_bam || $is_sam) && !defined($opts{S})? " | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : "";
my $cmd_bam2fq = "";
$cmd_bam2fq = " | $root/htsbox bam2fq -O " . (defined($opts{p})? "-s $prefix.oph.fq.gz " : "") . "- \\\n";
my $cmd_bam2fq = " | $root/htsbox bam2fq -O - \\\n";
$cmd = $cmd_sam2bam . $cmd_shuf . $cmd_bam2fq;
} elsif (@ARGV >= 3) {
$cmd = "$root/seqtk mergepe $ARGV[1] $ARGV[2] \\\n";
} else {
$cmd = "cat $ARGV[1] \\\n";
}

my $bwa_opts = ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "");
my $bwa_opts = "-p " . ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "");

$cmd .= " | $root/trimadap \\\n" if defined($opts{a});
$cmd .= " | $root/bwa mem $bwa_opts" . ($is_pe? "-p " : "") . "$ARGV[0] - 2> $prefix.log.bwamem \\\n";
$cmd .= " | $root/bwa mem $bwa_opts$ARGV[0] - 2> $prefix.log.bwamem \\\n";
$cmd .= " | $root/samblaster 2> $prefix.log.dedup \\\n" if defined($opts{d});

my $has_hla = 0;
Expand All @@ -131,15 +136,7 @@ if (-f "$ARGV[0].alt") {
}

my $t_sort = $opts{t} < 4? $opts{t} : 4;
$cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - $prefix.aln;\n" : " | gzip -1 > $prefix.aln.sam.gz;\n";

# map single end reads
if ($cmd =~ /$prefix.oph.fq.gz/) {
$cmd .= "[ -s $prefix.oph.fq.gz ] && zcat $prefix.oph.fq.gz \\\n";
$cmd .= " | $root/trimadap \\\n" if defined($opts{a});
$cmd .= " | $root/bwa mem $bwa_opts$ARGV[0] - 2>> $prefix.log.bwamem | gzip -1 > $prefix.oph.sam.gz;\n";
$cmd .= "rm -f $prefix.oph.fq.gz;\n";
}
$cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - $prefix.aln;\n" : " | $root/samtools view -1 - > $prefix.aln.bam;\n";

if ($has_hla && (!defined($opts{x}) || $opts{x} eq 'intractg')) {
$cmd .= "$root/run-HLA ". ($opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n";
Expand Down
33 changes: 24 additions & 9 deletions fastmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ int main_mem(int argc, char *argv[])
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
else if (c == 'a') opt->flag |= MEM_F_ALL;
else if (c == 'p') opt->flag |= MEM_F_PE;
else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE;
else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP;
Expand Down Expand Up @@ -167,7 +167,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " intractg: -B9 -O16 -L5 (intra-species contigs to ref)\n");
// fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A1 -B1 -O1 -E1 -N25 -FeaD.001\n");
fprintf(stderr, "\nInput/output options:\n\n");
fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");
fprintf(stderr, " -p smart pairing (ignoring in2.fq)\n");
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
fprintf(stderr, " -j ignore ALT contigs\n");
fprintf(stderr, "\n");
Expand Down Expand Up @@ -248,7 +248,7 @@ int main_mem(int argc, char *argv[])
if (optind + 2 < argc) {
if (opt->flag&MEM_F_PE) {
if (bwa_verbose >= 2)
fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__);
fprintf(stderr, "[W::%s] when '-p' is in use, the second query file is ignored.\n", __func__);
} else {
ko2 = kopen(argv[optind + 2], &fd2);
if (ko2 == 0) {
Expand All @@ -265,19 +265,34 @@ int main_mem(int argc, char *argv[])
actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads;
while ((seqs = bseq_read(actual_chunk_size, &n, ks, ks2)) != 0) {
int64_t size = 0;
if ((opt->flag & MEM_F_PE) && (n&1) == 1) {
if (bwa_verbose >= 2)
fprintf(stderr, "[W::%s] odd number of reads in the PE mode; last read dropped\n", __func__);
n = n>>1<<1;
}
if (!copy_comment)
for (i = 0; i < n; ++i) {
free(seqs[i].comment); seqs[i].comment = 0;
}
for (i = 0; i < n; ++i) size += seqs[i].l_seq;
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size);
mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0);
if (opt->flag & MEM_F_SMARTPE) {
bseq1_t *sep[2];
int i, n_sep[2];
mem_opt_t tmp_opt = *opt;
bseq_classify(n, seqs, n_sep, sep);
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]);
if (n_sep[0]) {
tmp_opt.flag &= ~MEM_F_PE;
mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, n_processed, n_sep[0], sep[0], 0);
for (i = 0; i < n_sep[0]; ++i)
seqs[sep[0][i].id].sam = sep[0][i].sam;
}
if (n_sep[1]) {
tmp_opt.flag |= MEM_F_PE;
mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, n_processed + n_sep[0], n_sep[1], sep[1], pes0);
for (i = 0; i < n_sep[1]; ++i)
seqs[sep[1][i].id].sam = sep[1][i].sam;
}
free(sep[0]); free(sep[1]);
} else mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0);
n_processed += n;
for (i = 0; i < n; ++i) {
if (seqs[i].sam) err_fputs(seqs[i].sam, stdout);
Expand Down
2 changes: 1 addition & 1 deletion main.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include "utils.h"

#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.10-r984-dirty"
#define PACKAGE_VERSION "0.7.10-r998-dirty"
#endif

int bwa_fa2pac(int argc, char *argv[]);
Expand Down

0 comments on commit 80e4ecf

Please sign in to comment.