From c9824432106175f00a2470c5ddaf217ce96a5c0d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 17 Sep 2014 16:26:28 -0400 Subject: [PATCH 1/2] r854: improved the calculation of pa and build pa filtering into BWA-MEM --- bwamem.c | 27 ++++++++++++++++----------- bwamem.h | 4 +++- fastmap.c | 6 ++++-- main.c | 2 +- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/bwamem.c b/bwamem.c index 533565be..7b6b1f45 100644 --- a/bwamem.c +++ b/bwamem.c @@ -78,6 +78,7 @@ mem_opt_t *mem_opt_init() o->min_chain_weight = 0; o->max_chain_extend = 1<<30; o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len); + o->min_pa_ratio = 0.8; bwa_fill_scmat(o->a, o->b, o->mat); return o; } @@ -514,14 +515,18 @@ int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id int_v z = {0,0,0}; if (n == 0) return 0; for (i = n_pri = 0; i < n; ++i) { - a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i); + a[i].sub = 0, a[i].alt_sc = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i); if (!a[i].is_alt) ++n_pri; } ks_introsort(mem_ars_hash, n, a); mem_mark_primary_se_core(opt, n, a, &z); - for (i = 0; i < n; ++i) // don't track the "parent" hit of ALT secondary hits - if (a[i].is_alt && a[i].secondary >= 0) - a[i].secondary = INT_MAX; + for (i = 0; i < n; ++i) { + mem_alnreg_t *p = &a[i]; + if (p->is_alt && p->secondary >= 0) // don't track the "parent" hit of ALT secondary hits + p->secondary = INT_MAX; + if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt) + p->alt_sc = a[p->secondary].score; + } if (n_pri > 0 || n_pri != n) { ks_introsort(mem_ars_hash2, n, a); for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1; @@ -808,7 +813,7 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq if (p->rid >= 0) { // with coordinate kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME kputl(p->pos + 1, str); kputc('\t', str); // POS - kputw(p->mapq, str); kputc('\t', str); // MAPQ + kputw(p->score < opt->min_pa_ratio * p->alt_sc? 0 : p->mapq, str); kputc('\t', str); // MAPQ if (p->n_cigar) { // aligned for (i = 0; i < p->n_cigar; ++i) { int c = p->cigar[i]&0xf; @@ -880,13 +885,11 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq for (i = 0; i < n; ++i) if (i != which && !(list[i].flag&0x100)) break; if (i < n) { // there are other primary hits; output them - int pri_alt_sc = -1; kputsn("\tSA:Z:", 6, str); for (i = 0; i < n; ++i) { const mem_aln_t *r = &list[i]; int k; - if (i == which || (list[i].flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit - if (list[i].is_alt) pri_alt_sc = pri_alt_sc > r->score? pri_alt_sc : r->score; + if (i == which || (r->flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit kputs(bns->anns[r->rid].name, str); kputc(',', str); kputl(r->pos+1, str); kputc(',', str); kputc("+-"[r->is_rev], str); kputc(',', str); @@ -897,9 +900,11 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq kputc(',', str); kputw(r->NM, str); kputc(';', str); } - if (pri_alt_sc > 0) - ksprintf(str, "\tpa:f:%.3f", (double)p->score / pri_alt_sc); } + if (p->alt_sc > 0) + ksprintf(str, "\tpa:f:%.3f", (double)p->score / p->alt_sc); + if (p->score < opt->min_pa_ratio * p->alt_sc) + ksprintf(str, "\tom:i:%d", p->mapq); } if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); } if (s->comment) { kputc('\t', str); kputs(s->comment, str); } @@ -1098,7 +1103,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * assert(a.rid == ar->rid); a.pos = pos - bns->anns[a.rid].offset; a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub; - a.is_alt = ar->is_alt; + a.is_alt = ar->is_alt; a.alt_sc = ar->alt_sc; free(query); return a; } diff --git a/bwamem.h b/bwamem.h index 20e8d05e..f5f33bbd 100644 --- a/bwamem.h +++ b/bwamem.h @@ -48,6 +48,7 @@ typedef struct { float XA_drop_ratio; // when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag float mask_level_redun; float mapQ_coef_len; + float min_pa_ratio; int mapQ_coef_fac; int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end @@ -62,6 +63,7 @@ typedef struct { int score; // best local SW score int truesc; // actual score corresponding to the aligned region; possibly smaller than $score int sub; // 2nd best SW score + int alt_sc; int csub; // SW score of a tandem hit int sub_n; // approximate number of suboptimal hits int w; // actual band width used in extension @@ -90,7 +92,7 @@ typedef struct { // This struct is only used for the convenience of API. uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234 char *XA; // alternative mappings - int score, sub; + int score, sub, alt_sc; } mem_aln_t; #ifdef __cplusplus diff --git a/fastmap.c b/fastmap.c index 546b2eaa..8164ff3f 100644 --- a/fastmap.c +++ b/fastmap.c @@ -55,7 +55,7 @@ int main_mem(int argc, char *argv[]) opt = mem_opt_init(); memset(&opt0, 0, sizeof(mem_opt_t)); - while ((c = getopt(argc, argv, "epaFMCSPHVYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:")) >= 0) { + while ((c = getopt(argc, argv, "epaFMCSPHVYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:g:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; else if (c == 'x') mode = optarg; else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; @@ -85,6 +85,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1; else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1; else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1; + else if (c == 'g') opt->min_pa_ratio = atof(optarg), opt0.min_pa_ratio = 1; else if (c == 'C') copy_comment = 1; else if (c == 'K') fixed_chunk_size = atoi(optarg); else if (c == 'Q') { @@ -138,7 +139,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); - fprintf(stderr, " -y INT find MEMs longer than {-k} * {-r} with size less than INT [%ld]\n", (long)opt->max_mem_intv); + fprintf(stderr, " -y INT find MEMs longer than {-k} * {-r} with size less than INT (EXPERIMENTAL) [%ld]\n", (long)opt->max_mem_intv); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio); @@ -164,6 +165,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); fprintf(stderr, "\n"); fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); + fprintf(stderr, " -g FLOAT set mapQ to zero if the ratio of the primary-to-alt scores below FLOAT [%.3f]\n", opt->min_pa_ratio); fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T); fprintf(stderr, " -h INT if there are 80%% of the max score, output all in XA [%d]\n", opt->max_hits); fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); diff --git a/main.c b/main.c index 123f0d4b..03f6b22d 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r849-dirty" +#define PACKAGE_VERSION "0.7.10-r854-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From a32d44d8d64b8a3b8417a8e8802560352a31ace4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 17 Sep 2014 23:07:56 -0400 Subject: [PATCH 2/2] r855: show ALT hits in the PE mode, too In the previous version, it does not --- bwamem_pair.c | 22 +++++++++++++++++++--- main.c | 2 +- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index f2124276..f754743e 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -240,6 +240,19 @@ int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, cons return ret; } +void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m); + +static inline void mem_reg2sam_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, kstring_t *str, bseq1_t *s, mem_alnreg_t *p, int extra_flag, const mem_aln_t *m) +{ // a simplified version of mem_reg2sam() + mem_aln_t t; + if (p->score < opt->T || p->secondary >= 0 || !p->is_alt) return; + t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); + t.flag |= extra_flag; + t.flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800; + mem_aln2sam(opt, bns, str, s, 1, &t, 0, m); + free(t.cigar); +} + #define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499)) int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) @@ -247,7 +260,6 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co extern int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); extern void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m); - extern void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m); extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query); int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2]; @@ -327,8 +339,12 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co // write SAM h[0] = mem_reg2aln(opt, bns, pac, s[0].l_seq, s[0].seq, &a[0].a[z[0]]); h[0].mapq = q_se[0]; h[0].flag |= 0x40 | extra_flag; h[0].XA = XA[0]? XA[0][z[0]] : 0; h[1] = mem_reg2aln(opt, bns, pac, s[1].l_seq, s[1].seq, &a[1].a[z[1]]); h[1].mapq = q_se[1]; h[1].flag |= 0x80 | extra_flag; h[1].XA = XA[1]? XA[1][z[1]] : 0; - mem_aln2sam(opt, bns, &str, &s[0], 1, &h[0], 0, &h[1]); s[0].sam = strdup(str.s); str.l = 0; - mem_aln2sam(opt, bns, &str, &s[1], 1, &h[1], 0, &h[0]); s[1].sam = str.s; + mem_aln2sam(opt, bns, &str, &s[0], 1, &h[0], 0, &h[1]); // write the primary hit for read1 + if (n_pri[0] < a[0].n) mem_reg2sam_alt(opt, bns, pac, &str, &s[0], &a[0].a[n_pri[0]], 0x41|extra_flag, &h[1]); + s[0].sam = strdup(str.s); str.l = 0; + mem_aln2sam(opt, bns, &str, &s[1], 1, &h[1], 0, &h[0]); // write the primary hit for read2 + if (n_pri[1] < a[1].n) mem_reg2sam_alt(opt, bns, pac, &str, &s[1], &a[1].a[n_pri[1]], 0x81|extra_flag, &h[0]); + s[1].sam = str.s; if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); free(h[0].cigar); // h[0].XA will be freed in the following block free(h[1].cigar); diff --git a/main.c b/main.c index 03f6b22d..bedbe43b 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r854-dirty" +#define PACKAGE_VERSION "0.7.10-r855-dirty" #endif int bwa_fa2pac(int argc, char *argv[]);