r998: smart pairing; allow mixture of SE/PE reads

scau-drr · Nov 18, 2014 · 80e4ecf · 80e4ecf
1 parent 6f9bf9a
commit 80e4ecf
Show file tree

Hide file tree

Showing 7 changed files with 89 additions and 64 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,29 +1,17 @@
-Release 0.7.11 (XX September, 2014)
+Release 0.7.11 (XX November, 2014)
 -----------------------------------
 
-A major change to BWA-MEM is the support of mapping to ALT contigs. To use this
-feature, users need to manually create a file `indexbase.alt` with each line
-giving the name of an ALT contig. During alignment, BWA-MEM will be able to
-classify potential hits to ALT and non-ALT hits. It reports alignments and
-assigns mapping quality (mapQ) loosely following these rules:
+A major change to BWA-MEM is the support of mapping to ALT contigs in addition
+to the primary assembly. Part of the ALT mapping strategy is implemented in
+BWA-MEM and the rest in a postprocessing script for now. Due to the extra
+layer of complexity on generating the reference genome and on the two-step
+mapping, we start to provide a wrapper script and precompiled binaries since
+this release. Please check README-alt.md for details.
 
- 1. The original mapQ of a non-ALT hit is computed across non-ALT hits only.
-    The reported mapQ of an ALT hit is always computed across all hits.
-
- 2. An ALT hit is only reported if its score is strictly better than all
-    overlapping non-ALT hits. A reported ALT hit is flagged with 0x800
-    (supplementary) unless there are no non-ALT hits.
-
- 3. The mapQ of a non-ALT hit is reduced to zero if its score is less than 80%
-    (controlled by option `-g`) of the score of an overlapping ALT hit. In this
-    case, the original mapQ is moved to the `om` tag.
-
-This way, non-ALT alignments are only affected by ALT contigs if there are
-significantly better ALT alignments. BWA-MEM is carefully engineered such that
-ALT contigs do not interfere with the alignments to the primary assembly.
-
-Users may consider to use ALT contigs from GRCh38. I am also constructing a
-non-redundant and more complete set of sequences missing from GRCh38.
+Another major addition to BWA-MEM is HLA typing, which made possible with the
+new ALT mapping strategy. Necessary data and programs are also included in
+the binary release. The wrapper script also performs HLA typing when HLA genes
+are also included in the reference genome as additional ALT contigs.
 
 Other notable changes to BWA-MEM:
 
@@ -42,7 +30,9 @@ Other notable changes to BWA-MEM:
  * Added new pre-setting for Oxford Nanopore 2D reads. For small genomes,
    though, LAST is still more sensitive.
 
-(0.7.11: XX September 2014, rXXX)
+ * Added LAST-like seeding. This improves the accuracy for longer reads.
+
+(0.7.11: XX November 2014, rXXX)
 
 
 

diff --git a/bwa.c b/bwa.c
@@ -7,6 +7,7 @@
 #include "ksw.h"
 #include "utils.h"
 #include "kstring.h"
+#include "kvec.h"
 
 #ifdef USE_MALLOC_WRAPPERS
 #  include "malloc_wrap.h"
@@ -55,10 +56,12 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
 		}
 		trim_readno(&ks->name);
 		kseq2bseq1(ks, &seqs[n]);
+		seqs[n].id = n;
 		size += seqs[n++].l_seq;
 		if (ks2) {
 			trim_readno(&ks2->name);
 			kseq2bseq1(ks2, &seqs[n]);
+			seqs[n].id = n;
 			size += seqs[n++].l_seq;
 		}
 		if (size >= chunk_size && (n&1) == 0) break;
@@ -71,6 +74,24 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
 	return seqs;
 }
 
+void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2])
+{
+	int i, has_last;
+	kvec_t(bseq1_t) a[2] = {{0,0,0}, {0,0,0}};
+	for (i = 1, has_last = 1; i < n; ++i) {
+		if (has_last) {
+			if (strcmp(seqs[i].name, seqs[i-1].name) == 0) {
+				kv_push(bseq1_t, a[1], seqs[i-1]);
+				kv_push(bseq1_t, a[1], seqs[i]);
+				has_last = 0;
+			} else kv_push(bseq1_t, a[0], seqs[i-1]);
+		} else has_last = 1;
+	}
+	if (has_last) kv_push(bseq1_t, a[0], seqs[i-1]);
+	sep[0] = a[0].a, m[0] = a[0].n;
+	sep[1] = a[1].a, m[1] = a[1].n;
+}
+
 /*****************
  * CIGAR related *
  *****************/

diff --git a/bwa.h b/bwa.h
@@ -23,7 +23,7 @@ typedef struct {
 } bwaidx_t;
 
 typedef struct {
-	int l_seq;
+	int l_seq, id;
 	char *name, *comment, *seq, *qual, *sam;
 } bseq1_t;
 
@@ -35,6 +35,7 @@ extern "C" {
 #endif
 
 	bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
+	void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]);
 
 	void bwa_fill_scmat(int a, int b, int8_t mat[25]);
 	uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);

diff --git a/bwamem.h b/bwamem.h
@@ -20,6 +20,7 @@ typedef struct __smem_i smem_i;
 #define MEM_F_ALN_REG   0x80
 #define MEM_F_REF_HDR	0x100
 #define MEM_F_SOFTCLIP  0x200
+#define MEM_F_SMARTPE   0x400
 
 typedef struct {
 	int a, b;               // match score and mismatch penalty

diff --git a/extras/run-bwamem b/extras/run-bwamem
@@ -5,7 +5,7 @@ use warnings;
 use Getopt::Std;
 
 my %opts = (t=>1, n=>64);
-getopts("Spadsko:R:x:t:", \%opts);
+getopts("Sadsko:R:x:t:", \%opts);
 
 die('
 Usage:   run-bwamem [options] <idxbase> <file1> [file2]
@@ -17,30 +17,36 @@ Options: -o STR    prefix for output files                       [inferred from
                    pacbio:   pacbio subreads (~10kb query, high error rate)
                    ont2d:    Oxford Nanopore reads (~10kb query, higher error rate)
          -t INT    number of threads                                               [1]
-         -p        input are paired-end reads if file2 absent
 
          -a        trim HiSeq2000/2500 PE resequencing adapters (via trimadap)
          -d        mark duplicate (via samblaster)
          -S        for SAM/BAM input, don\'t shuffle
-         -s        sort the output alignment (higher RAM footprint)
+         -s        sort the output alignment (requring more RAM)
          -k        keep temporary files generated by typeHLA
 
-Note: File type is determined by the file extension of the first input sequence file.
-      Recognized file extensions: fasta, fa, fastq, fq, fasta.gz, fa.gz, fastq.gz,
-      fq.gz, sam, sam.gz and bam. SAM/BAM input will be converted to FASTQ.
+Examples:
 
-      When option -d is in use, all the input reads are required to come from the same
-      library and all the reads from the library shall be included in the input. In
-      addition, samblaster does not distinguish optical and PCR duplicates.
+ * Map paired-end reads to GRCh38+ALT+decoy+HLA and perform HLA typing:
 
-      The output may consist of the following files:
+     run-bwamem -o prefix -t8 -R"@RG\tID:foo\tSM:bar" hs38d6.fa read1.fq.gz read2.fq.gz
 
-        {-o}.aln.sam.gz - unsorted alignment (unless -s is specified)
-        {-o}.aln.bam    - sorted alignment (if -s is specified)
-        {-o}.oph.sam.gz - orphan single-end reads mapping (if input is paired SAM/BAM)
-        {-o}.hla.top    - best HLA typing for the six classical HLA genes
-        {-o}.hla.all    - additional HLA typing
-        {-o}.log.*      - log files
+ * Remap coordinate-sorted BAM, trim Illumina PE adapters and sort the output. The BAM
+   may contain single-end or paired-end reads, or a mixture of the two types. The read
+   groups are not transferred to the output BAM.
+
+     run-bwamem -sao prefix hs38d6.fa old-srt.bam
+
+ * Remap name-grouped BAM and mark duplicates. Note that in this case, all reads from
+   a single library should be aligned at the same time. Paired-end only.
+
+	 run-bwamem -Sdo prefix hs38d6.fa old-unsrt.bam
+
+Output files:
+
+  {-o}.aln.bam - final alignment
+  {-o}.hla.top - best genotypes for the 6 classical HLA genes (if there are HLA-* contigs)
+  {-o}.hla.all - additional HLA genotypes consistent with data
+  {-o}.log.*   - log files
 
 ') if @ARGV < 2;
 
@@ -103,19 +109,18 @@ if ($is_sam || $is_bam) {
 	my $cmd_sam2bam = $is_sam? "$root/htsbox samview -uS $ARGV[1] \\\n" : "cat $ARGV[1] \\\n";
 	my $ntmps = int($size / 4e9) + 1;
 	my $cmd_shuf = ($is_bam || $is_sam) && !defined($opts{S})? "  | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : "";
-	my $cmd_bam2fq = "";
-	$cmd_bam2fq = "  | $root/htsbox bam2fq -O " . (defined($opts{p})? "-s $prefix.oph.fq.gz " : "") . "- \\\n";
+	my $cmd_bam2fq = "  | $root/htsbox bam2fq -O - \\\n";
 	$cmd = $cmd_sam2bam . $cmd_shuf . $cmd_bam2fq;
 } elsif (@ARGV >= 3) {
 	$cmd = "$root/seqtk mergepe $ARGV[1] $ARGV[2] \\\n";
 } else {
 	$cmd = "cat $ARGV[1] \\\n";
 }
 
-my $bwa_opts = ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "");
+my $bwa_opts = "-p " . ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "");
 
 $cmd .= "  | $root/trimadap \\\n" if defined($opts{a});
-$cmd .= "  | $root/bwa mem $bwa_opts" . ($is_pe? "-p " : "") . "$ARGV[0] - 2> $prefix.log.bwamem \\\n";
+$cmd .= "  | $root/bwa mem $bwa_opts$ARGV[0] - 2> $prefix.log.bwamem \\\n";
 $cmd .= "  | $root/samblaster 2> $prefix.log.dedup \\\n" if defined($opts{d});
 
 my $has_hla = 0;
@@ -131,15 +136,7 @@ if (-f "$ARGV[0].alt") {
 }
 
 my $t_sort = $opts{t} < 4? $opts{t} : 4;
-$cmd .= defined($opts{s})? "  | $root/samtools sort -@ $t_sort -m1G - $prefix.aln;\n" : "  | gzip -1 > $prefix.aln.sam.gz;\n";
-
-# map single end reads
-if ($cmd =~ /$prefix.oph.fq.gz/) {
-	$cmd .= "[ -s $prefix.oph.fq.gz ] && zcat $prefix.oph.fq.gz \\\n";
-	$cmd .= "  | $root/trimadap \\\n" if defined($opts{a});
-	$cmd .= "  | $root/bwa mem $bwa_opts$ARGV[0] - 2>> $prefix.log.bwamem | gzip -1 > $prefix.oph.sam.gz;\n";
-	$cmd .= "rm -f $prefix.oph.fq.gz;\n";
-}
+$cmd .= defined($opts{s})? "  | $root/samtools sort -@ $t_sort -m1G - $prefix.aln;\n" : "  | $root/samtools view -1 - > $prefix.aln.bam;\n";
 
 if ($has_hla && (!defined($opts{x}) || $opts{x} eq 'intractg')) {
 	$cmd .= "$root/run-HLA ". ($opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n";

diff --git a/fastmap.c b/fastmap.c
@@ -66,7 +66,7 @@ int main_mem(int argc, char *argv[])
 		else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
 		else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
 		else if (c == 'a') opt->flag |= MEM_F_ALL;
-		else if (c == 'p') opt->flag |= MEM_F_PE;
+		else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE;
 		else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
 		else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
 		else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP;
@@ -167,7 +167,7 @@ int main_mem(int argc, char *argv[])
 		fprintf(stderr, "                     intractg: -B9 -O16 -L5  (intra-species contigs to ref)\n");
 //		fprintf(stderr, "                     pbread: -k13 -W40 -c1000 -r10 -A1 -B1 -O1 -E1 -N25 -FeaD.001\n");
 		fprintf(stderr, "\nInput/output options:\n\n");
-		fprintf(stderr, "       -p            first query file consists of interleaved paired-end sequences\n");
+		fprintf(stderr, "       -p            smart pairing (ignoring in2.fq)\n");
 		fprintf(stderr, "       -R STR        read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
 		fprintf(stderr, "       -j            ignore ALT contigs\n");
 		fprintf(stderr, "\n");
@@ -248,7 +248,7 @@ int main_mem(int argc, char *argv[])
 	if (optind + 2 < argc) {
 		if (opt->flag&MEM_F_PE) {
 			if (bwa_verbose >= 2)
-				fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__);
+				fprintf(stderr, "[W::%s] when '-p' is in use, the second query file is ignored.\n", __func__);
 		} else {
 			ko2 = kopen(argv[optind + 2], &fd2);
 			if (ko2 == 0) {
@@ -265,19 +265,34 @@ int main_mem(int argc, char *argv[])
 	actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads;
 	while ((seqs = bseq_read(actual_chunk_size, &n, ks, ks2)) != 0) {
 		int64_t size = 0;
-		if ((opt->flag & MEM_F_PE) && (n&1) == 1) {
-			if (bwa_verbose >= 2)
-				fprintf(stderr, "[W::%s] odd number of reads in the PE mode; last read dropped\n", __func__);
-			n = n>>1<<1;
-		}
 		if (!copy_comment)
 			for (i = 0; i < n; ++i) {
 				free(seqs[i].comment); seqs[i].comment = 0;
 			}
 		for (i = 0; i < n; ++i) size += seqs[i].l_seq;
 		if (bwa_verbose >= 3)
 			fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size);
-		mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0);
+		if (opt->flag & MEM_F_SMARTPE) {
+			bseq1_t *sep[2];
+			int i, n_sep[2];
+			mem_opt_t tmp_opt = *opt;
+			bseq_classify(n, seqs, n_sep, sep);
+			if (bwa_verbose >= 3)
+				fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]);
+			if (n_sep[0]) {
+				tmp_opt.flag &= ~MEM_F_PE;
+				mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, n_processed, n_sep[0], sep[0], 0);
+				for (i = 0; i < n_sep[0]; ++i)
+					seqs[sep[0][i].id].sam = sep[0][i].sam;
+			}
+			if (n_sep[1]) {
+				tmp_opt.flag |= MEM_F_PE;
+				mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, n_processed + n_sep[0], n_sep[1], sep[1], pes0);
+				for (i = 0; i < n_sep[1]; ++i)
+					seqs[sep[1][i].id].sam = sep[1][i].sam;
+			}
+			free(sep[0]); free(sep[1]);
+		} else mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0);
 		n_processed += n;
 		for (i = 0; i < n; ++i) {
 			if (seqs[i].sam) err_fputs(seqs[i].sam, stdout);

diff --git a/main.c b/main.c
@@ -4,7 +4,7 @@
 #include "utils.h"
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.7.10-r984-dirty"
+#define PACKAGE_VERSION "0.7.10-r998-dirty"
 #endif
 
 int bwa_fa2pac(int argc, char *argv[]);