Skip to content

Commit

Permalink
r929: added simplified LAST-like seeding
Browse files Browse the repository at this point in the history
  • Loading branch information
lh3 committed Oct 20, 2014
1 parent 25d4551 commit 038af2a
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 19 deletions.
21 changes: 18 additions & 3 deletions bwamem.c
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
// first pass: find all SMEMs
while (x < len) {
if (seq[x] < 4) {
x = bwt_smem1a(bwt, len, seq, x, start_width, split_len, opt->max_mem_intv, &a->mem1, a->tmpv);
x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv);
for (i = 0; i < a->mem1.n; ++i) {
bwtintv_t *p = &a->mem1.a[i];
int slen = (uint32_t)p->info - (p->info>>32); // seed length
Expand All @@ -131,7 +131,6 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
} else ++x;
}
// second pass: find MEMs inside a long SMEM
if (opt->max_mem_intv > 0) goto sort_intv;
old_n = a->mem.n;
for (k = 0; k < old_n; ++k) {
bwtintv_t *p = &a->mem.a[k];
Expand All @@ -142,8 +141,24 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len)
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
}
// third pass: LAST-like
if (opt->max_mem_intv > 0) {
x = 0;
while (x < len) {
if (seq[x] < 4) {
if (1) {
bwtintv_t m;
x = bwt_seed_strategy1(bwt, len, seq, x, opt->max_mem_intv, &m);
if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m);
} else { // for now, we never come to this block which is slower
x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv);
for (i = 0; i < a->mem1.n; ++i)
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
}
} else ++x;
}
}
// sort
sort_intv:
ks_introsort(mem_intv, a->mem.n, a->mem.a);
}

Expand Down
2 changes: 1 addition & 1 deletion bwamem_extra.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ const bwtintv_v *smem_next(smem_i *itr)
while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
if (itr->start == itr->len) return 0;
ori_start = itr->start;
itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_len, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM
itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM
return itr->matches;
}

Expand Down
44 changes: 32 additions & 12 deletions bwt.c
Original file line number Diff line number Diff line change
Expand Up @@ -285,8 +285,8 @@ static void bwt_reverse_intvs(bwtintv_v *p)
}
}
}

int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
// NOTE: $max_intv is not currently used in BWA-MEM
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
{
int i, j, c, ret;
bwtintv_t ik, ok[4];
Expand All @@ -302,10 +302,12 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
ik.info = x + 1;

for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search
if (q[i] < 4) { // an A/C/G/T base
if (ik.x[2] < max_intv) { // an interval small enough
kv_push(bwtintv_t, *curr, ik);
break;
} else if (q[i] < 4) { // an A/C/G/T base
c = 3 - q[i]; // complement of q[i]
bwt_extend(bwt, &ik, ok, 0);
if (ik.x[2] < max_intv && (i - x > max_len || ik.x[2] == 1)) break;
if (ok[c].x[2] != ik.x[2]) { // change of the interval size
kv_push(bwtintv_t, *curr, ik);
if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further
Expand All @@ -325,13 +327,8 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base
for (j = 0, curr->n = 0; j < prev->n; ++j) {
bwtintv_t *p = &prev->a[j];
int max_stop = 0;
bwt_extend(bwt, p, ok, 1);
if (ok[c].x[2] != p->x[2]) { // change of interval size
if (p->info - i - 1 > max_len && p->x[2] < max_intv)
max_stop = 1;
}
if (c < 0 || ok[c].x[2] < min_intv || max_stop) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
if (c >= 0 && ik.x[2] >= max_intv) bwt_extend(bwt, p, ok, 1);
if (c < 0 || ik.x[2] < max_intv || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches
if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
Expand All @@ -355,7 +352,30 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,

int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
{
return bwt_smem1a(bwt, len, q, x, min_intv, INT_MAX, 0, mem, tmpvec);
return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec);
}

int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem)
{
int i, c;
bwtintv_t ik, ok[4];

memset(mem, 0, sizeof(bwtintv_t));
if (q[x] > 3) return x + 1;
bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
for (i = x + 1; i < len; ++i) { // forward search
if (q[i] < 4) { // an A/C/G/T base
c = 3 - q[i]; // complement of q[i]
bwt_extend(bwt, &ik, ok, 0);
if (ok[c].x[2] < max_intv) {
*mem = ok[c];
mem->info = (uint64_t)x<<32 | (i + 1);
return i + 1;
}
ik = ok[c];
} else return i + 1;
}
return len;
}

/*************************
Expand Down
4 changes: 2 additions & 2 deletions bwt.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,9 @@ extern "C" {
* Return the end of the longest exact match starting from _x_.
*/
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);

// SMEM iterator interface
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem);

#ifdef __cplusplus
}
Expand Down
2 changes: 1 addition & 1 deletion main.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include "utils.h"

#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.10-r926-dirty"
#define PACKAGE_VERSION "0.7.10-r929-dirty"
#endif

int bwa_fa2pac(int argc, char *argv[]);
Expand Down

0 comments on commit 038af2a

Please sign in to comment.