Skip to content

Commit

Permalink
r867: fixed a few bugs; added ALT hits to XA
Browse files Browse the repository at this point in the history
  • Loading branch information
lh3 committed Sep 19, 2014
1 parent b22bed2 commit 9af3606
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 31 deletions.
2 changes: 1 addition & 1 deletion bwa-helper.js
Original file line number Diff line number Diff line change
Expand Up @@ -780,7 +780,7 @@ function main(args)
else if (cmd == 'markovlp') bwa_markOvlp(args);
else if (cmd == 'pas2reg') bwa_pas2reg(args);
else if (cmd == 'reg2cut') bwa_reg2cut(args);
else if (cmd == 'altgen' || cmd == 'genalt') bwa_alt(args);
else if (cmd == 'altgen' || cmd == 'genalt') bwa_altgen(args);
else if (cmd == 'altlift') bwa_altlift(args);
else if (cmd == 'shortname') bwa_shortname(args);
else warn("Unrecognized command");
Expand Down
33 changes: 23 additions & 10 deletions bwamem.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ mem_opt_t *mem_opt_init()
o->split_factor = 1.5;
o->chunk_size = 10000000;
o->n_threads = 1;
o->max_hits = 5;
o->max_XA_hits = 5;
o->max_XA_hits_alt = 50;
o->max_matesw = 50;
o->mask_level_redun = 0.95;
o->min_chain_weight = 0;
Expand Down Expand Up @@ -511,26 +512,38 @@ static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *

int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id)
{
int i, n_pri;
int i, j, n_pri;
int_v z = {0,0,0};
if (n == 0) return 0;
for (i = n_pri = 0; i < n; ++i) {
a[i].sub = 0, a[i].alt_sc = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i);
a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_alt = -1, a[i].hash = hash_64(id+i);
if (!a[i].is_alt) ++n_pri;
}
ks_introsort(mem_ars_hash, n, a);
mem_mark_primary_se_core(opt, n, a, &z);
for (i = 0; i < n; ++i) {
mem_alnreg_t *p = &a[i];
if (p->is_alt && p->secondary >= 0) // don't track the "parent" hit of ALT secondary hits
p->secondary = INT_MAX;
p->secondary_alt = i; // keep the rank in the first round
if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt)
p->alt_sc = a[p->secondary].score;
}
if (n_pri > 0 && n_pri < n) {
ks_introsort(mem_ars_hash2, n, a);
for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1;
mem_mark_primary_se_core(opt, n_pri, a, &z);
if (n_pri >= 0 && n_pri < n) {
kv_resize(int, z, n);
if (n_pri > 0) ks_introsort(mem_ars_hash2, n, a);
for (i = 0; i < n; ++i) z.a[a[i].secondary_alt] = i;
for (i = 0; i < n; ++i) {
if (a[i].secondary < 0) {
a[i].secondary_alt = -1;
continue;
}
j = z.a[a[i].secondary];
a[i].secondary_alt = a[j].is_alt? j : -1;
if (a[i].is_alt) a[i].secondary = INT_MAX;
}
if (n_pri > 0) { // mark primary for hits to the primary assembly only
for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1;
mem_mark_primary_se_core(opt, n_pri, a, &z);
}
}
free(z.a);
return n_pri;
Expand Down Expand Up @@ -976,7 +989,7 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score
if (k && p->secondary < 0) // if supplementary
q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800;
if (k && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq;
if (k && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq;
}
if (aa.n == 0) { // no alignments good enough; then write an unaligned record
mem_aln_t t;
Expand Down
3 changes: 2 additions & 1 deletion bwamem.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ typedef struct {
int mapQ_coef_fac;
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
int max_hits; // if there are max_hits or fewer, output them all
int max_XA_hits, max_XA_hits_alt; // if there are max_hits or fewer, output them all
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
} mem_opt_t;

Expand All @@ -69,6 +69,7 @@ typedef struct {
int w; // actual band width used in extension
int seedcov; // length of regions coverged by seeds
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
int secondary_alt;
int seedlen0; // length of the starting seed
int n_comp:30, is_alt:2; // number of sub-alignments chained together
float frac_rep;
Expand Down
43 changes: 27 additions & 16 deletions bwamem_extra.c
Original file line number Diff line number Diff line change
Expand Up @@ -112,43 +112,54 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
s->sam = str.s;
}

static inline void get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i, int r[2])
{
int j = a[i].secondary, k = a[i].secondary_alt;
r[0] = r[1] = -1;
if (j >= 0 && j < INT_MAX && !a[j].is_alt && !a[i].is_alt && a[i].score >= a[j].score * XA_drop_ratio) r[0] = j;
if (k >= 0 && a[k].is_alt && a[i].score >= a[k].score * XA_drop_ratio) r[1] = k;
}

// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se()
{
int i, k, *cnt, tot;
kstring_t *aln = 0;
int i, k, *cnt, tot, r[2];
kstring_t *aln = 0, str = {0,0,0};
char **XA = 0;

cnt = calloc(a->n, sizeof(int));
for (i = 0, tot = 0; i < a->n; ++i) {
int j = a->a[i].secondary;
if (j >= 0 && j < INT_MAX && a->a[i].score >= a->a[j].score * opt->XA_drop_ratio)
++cnt[j], ++tot;
get_pri_idx(opt->XA_drop_ratio, a->a, i, r);
if (r[0] >= 0) ++cnt[r[0]], ++tot;
if (r[1] >= 0) ++cnt[r[1]], ++tot;
}
if (tot == 0) goto end_gen_alt;
aln = calloc(a->n, sizeof(kstring_t));
for (i = 0; i < a->n; ++i) {
mem_aln_t t;
int j = a->a[i].secondary;
if (j < 0 || j == INT_MAX || a->a[i].score < a->a[j].score * opt->XA_drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later
if (cnt[j] > opt->max_hits) continue;
get_pri_idx(opt->XA_drop_ratio, a->a, i, r);
if (r[0] < 0 && r[1] < 0) continue;
if ((r[0] >= 0 && cnt[r[0]] > opt->max_XA_hits) || (r[1] >= 0 && cnt[r[1]] > opt->max_XA_hits_alt)) continue;
t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
kputs(bns->anns[t.rid].name, &aln[j]);
kputc(',', &aln[j]); kputc("+-"[t.is_rev], &aln[j]); kputl(t.pos + 1, &aln[j]);
kputc(',', &aln[j]);
str.l = 0;
kputs(bns->anns[t.rid].name, &str);
kputc(',', &str); kputc("+-"[t.is_rev], &str); kputl(t.pos + 1, &str);
kputc(',', &str);
for (k = 0; k < t.n_cigar; ++k) {
kputw(t.cigar[k]>>4, &aln[j]);
kputc("MIDSHN"[t.cigar[k]&0xf], &aln[j]);
kputw(t.cigar[k]>>4, &str);
kputc("MIDSHN"[t.cigar[k]&0xf], &str);
}
kputc(',', &aln[j]); kputw(t.NM, &aln[j]);
kputc(';', &aln[j]);
kputc(',', &str); kputw(t.NM, &str);
kputc(';', &str);
free(t.cigar);
if (r[0] >= 0 && cnt[r[0]] <= opt->max_XA_hits) kputsn(str.s, str.l, &aln[r[0]]);
if (r[1] >= 0 && cnt[r[1]] <= opt->max_XA_hits_alt) kputsn(str.s, str.l, &aln[r[1]]);
}
XA = calloc(a->n, sizeof(char*));
for (k = 0; k < a->n; ++k)
XA[k] = aln[k].s;

end_gen_alt:
free(cnt); free(aln);
free(cnt); free(aln); free(str.s);
return XA;
}
9 changes: 7 additions & 2 deletions fastmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ int main_mem(int argc, char *argv[])
else if (c == 'r') opt->split_factor = atof(optarg), opt0.split_factor = 1.;
else if (c == 'D') opt->drop_ratio = atof(optarg), opt0.drop_ratio = 1.;
else if (c == 'm') opt->max_matesw = atoi(optarg), opt0.max_matesw = 1;
else if (c == 'h') opt->max_hits = atoi(optarg), opt0.max_hits = 1;
else if (c == 's') opt->split_width = atoi(optarg), opt0.split_width = 1;
else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1;
else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1;
Expand All @@ -88,6 +87,12 @@ int main_mem(int argc, char *argv[])
else if (c == 'g') opt->min_pa_ratio = atof(optarg), opt0.min_pa_ratio = 1;
else if (c == 'C') copy_comment = 1;
else if (c == 'K') fixed_chunk_size = atoi(optarg);
else if (c == 'h') {
opt0.max_XA_hits = opt0.max_XA_hits_alt = 1;
opt->max_XA_hits = opt->max_XA_hits_alt = strtol(optarg, &p, 10);
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
opt->max_XA_hits_alt = strtol(p+1, &p, 10);
}
else if (c == 'Q') {
opt0.mapQ_coef_len = 1;
opt->mapQ_coef_len = atoi(optarg);
Expand Down Expand Up @@ -167,7 +172,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
fprintf(stderr, " -g FLOAT set mapQ to zero if the ratio of the primary-to-alt scores below FLOAT [%.3f]\n", opt->min_pa_ratio);
fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T);
fprintf(stderr, " -h INT if there are <INT hits with score >80%% of the max score, output all in XA [%d]\n", opt->max_hits);
fprintf(stderr, " -h INT[,INT] if there are <INT hits with score >80%% of the max score, output all in XA [%d,%d]\n", opt->max_XA_hits, opt->max_XA_hits_alt);
fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");
fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
fprintf(stderr, " -V output the reference FASTA header in the XR tag\n");
Expand Down
2 changes: 1 addition & 1 deletion main.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include "utils.h"

#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.10-r858-dirty"
#define PACKAGE_VERSION "0.7.10-r867-dirty"
#endif

int bwa_fa2pac(int argc, char *argv[]);
Expand Down

0 comments on commit 9af3606

Please sign in to comment.