Skip to content

Commit

Permalink
drop smem based mapping algorithm
Browse files Browse the repository at this point in the history
While we can compute smems very efficiently, there is still a long way to get
the alignment. On simulated data, this smem-based algorithm is 4X faster than
bwasw and twice as fast as bowtie2, but the accuracy is far lower than bwasw
and even lower than bowtie2 in the high-mapQ range. I am kind of sure that if
we continue to increase the mapping accuracy, the speed will approach to bwasw,
if not slower.

Smem-based mapping algorithm is still interesting, but given that I am short of
time, I will not explore it further.
  • Loading branch information
Heng Li committed Oct 27, 2011
1 parent 7467671 commit 7babb54
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 132 deletions.
147 changes: 16 additions & 131 deletions fastmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,125 +8,26 @@
#include "kseq.h"
KSEQ_INIT(gzFile, gzread)

typedef struct {
int qbeg, len;
int64_t tbeg;
} fmhit_t;

#define hit_qlt(a, b) ((a).qbeg < (b).qbeg)
#define hit_tlt(a, b) ((a).tbeg < (b).tbeg)

#include "ksort.h"
KSORT_INIT(hitq, fmhit_t, hit_qlt)
KSORT_INIT(hitt, fmhit_t, hit_tlt)

extern unsigned char nst_nt4_table[256];

typedef struct { size_t n, m; fmhit_t *a; } fmhit_v;

static uint64_t cluster_qhits(fmhit_v *hits, int beg, int end, int max_dist, int *_score)
{
int qend, score, max = 0, cbeg = beg;
size_t i;
uint64_t max_cluster = 0;
ks_introsort(hitq, end - beg, hits->a + beg);
qend = hits->a[beg].qbeg + hits->a[beg].len;
score = hits->a[beg].len;
for (i = beg + 1; i < end; ++i) {
fmhit_t *p = &hits->a[i];
if (p->qbeg - qend > max_dist) {
if (score > max) max = score, max_cluster = (uint64_t)cbeg<<32 | i;
score = 0; cbeg = i;
}
score += p->len;
qend = p->qbeg + p->len;
}
if (score > max) max = score, max_cluster = (uint64_t)cbeg<<32 | i;
*_score = score;
return max_cluster;
}

static int cluster_hits(fmhit_v *hits, int max_dist)
{
size_t i;
int64_t tend;
uint64_t cluster, max_cluster = 0;
int j, score, max = 0, max2 = 0, cbeg = 0, cend;
ks_introsort(hitt, hits->n, hits->a);
tend = hits->a[0].tbeg + hits->a[0].len;
for (i = 1; i < hits->n; ++i) {
fmhit_t *p = &hits->a[i];
if (p->tbeg - tend > max_dist) {
cluster = cluster_qhits(hits, cbeg, i, max_dist, &score);
if (score > max) max2 = max, max = score, max_cluster = cluster;
else if (score > max2) max2 = score;
cbeg = i;
}
tend = p->tbeg + p->len;
}
cluster = cluster_qhits(hits, cbeg, i, max_dist, &score);
if (score > max) max2 = max, max = score, max_cluster = cluster;
else if (score > max2) max2 = score;
cbeg = max_cluster>>32; cend = (uint32_t)max_cluster;
for (i = 0, j = cbeg; j < cend; ++j) hits->a[i++] = hits->a[j];
hits->n = i;
return (int)(200.0 * (max - max2) / max + .499);
}

static int fake_cigar(const bntseq_t *bns, fmhit_v *hits, int beg, int end, int len, uint32_t *cigar, int64_t *pos, int *is_rev)
{
size_t i;
int qbeg, qend, n_cigar = 0;
int64_t tbeg, tend, tmp;
qbeg = len; qend = 0;
tbeg = bns->l_pac<<1; tend = 0;
for (i = beg; i < end; ++i) {
fmhit_t *p = &hits->a[i];
if (p->qbeg < qbeg) qbeg = p->qbeg;
if (p->qbeg + p->len > qend) qend = p->qbeg + p->len;
if (p->tbeg < tbeg) tbeg = p->tbeg;
if (p->tbeg + p->len > qend) tend = p->tbeg + p->len;
}
if (tbeg >= bns->l_pac) {
tmp = tend; tend = bns->l_pac*2 - tbeg; tbeg = bns->l_pac*2 - tmp;
tmp = qend; qend = len - qbeg; qbeg = len - tmp;
*is_rev = 1;
} else *is_rev = 0;
*pos = tbeg;
if (qbeg) cigar[n_cigar++] = qbeg<<4|4;
if (tend - tbeg < qend - qbeg) { // reference is shorter
cigar[n_cigar++] = (uint32_t)(tend - tbeg)<<4 | 0;
cigar[n_cigar++] = (uint32_t)((qend - qbeg) - (tend - tbeg))<<4 | 2;
} else if (tend - tbeg > qend - qbeg) { // query is shorter
cigar[n_cigar++] = (uint32_t)(qend - qbeg)<<4 | 0;
cigar[n_cigar++] = (uint32_t)((tend - tbeg) - (qend - qbeg))<<4 | 1;
} else cigar[n_cigar++] = (uint32_t)(qend - qbeg)<<4 | 0;
if (len > qend) cigar[n_cigar++] = (uint32_t)(len - qend)<<4|4;
return n_cigar;
}

int main_fastmap(int argc, char *argv[])
{
int c, i, min_iwidth = 20, min_len = 17, max_dist = 50, mem_only = 0;
int c, i, min_iwidth = 20, min_len = 17;
kseq_t *seq;
bwtint_t k;
gzFile fp;
bwt_t *bwt;
bntseq_t *bns;
bwtintv_v a[3], mem, *tvec[3];
fmhit_v hits;
uint32_t cigar[1024];

while ((c = getopt(argc, argv, "w:l:d:p")) >= 0) {
while ((c = getopt(argc, argv, "w:l:")) >= 0) {
switch (c) {
case 'w': min_iwidth = atoi(optarg); break;
case 'l': min_len = atoi(optarg); break;
case 'd': max_dist = atoi(optarg); break;
case 'p': mem_only = 1; break;
}
}
if (optind + 1 >= argc) {
fprintf(stderr, "bwa fastmap <idxbase> <in.fq>\n");
fprintf(stderr, "Usage: bwa fastmap [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", min_len, min_iwidth);
return 1;
}

Expand All @@ -145,46 +46,30 @@ int main_fastmap(int argc, char *argv[])
kv_init(a[i]);
tvec[i] = &a[i];
}
kv_init(mem); kv_init(hits);
kv_init(mem);
while (kseq_read(seq) >= 0) {
for (i = 0; i < seq->seq.l; ++i)
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
bwt_smem(bwt, seq->seq.l, (uint8_t*)seq->seq.s, &mem, tvec);
if (mem_only) printf("SQ\t%s\t%ld\n", seq->name.s, seq->seq.l);
for (i = 0, hits.n = 0; i < mem.n; ++i) {
printf("SQ\t%s\t%ld\n", seq->name.s, seq->seq.l);
for (i = 0; i < mem.n; ++i) {
bwtintv_t *p = &mem.a[i];
if ((uint32_t)p->info - (p->info>>32) < min_len) continue;
if (mem_only) printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
if (p->x[2] <= min_iwidth) {
for (k = 0; k < p->x[2]; ++k) {
fmhit_t z;
z.tbeg = bwt_sa(bwt, p->x[0] + k);
z.qbeg = p->info>>32;
z.len = (uint32_t)p->info - z.qbeg;
kv_push(fmhit_t, hits, z);
if (mem_only) {
int is_rev, ref_id;
int64_t pos = bns_depos(bns, z.tbeg, &is_rev);
if (is_rev) pos -= z.len - 1;
bns_cnt_ambi(bns, pos, z.len, &ref_id);
printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
}
bwtint_t pos;
int len, is_rev, ref_id;
len = (uint32_t)p->info - (p->info>>32);
pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev);
if (is_rev) pos -= len - 1;
bns_cnt_ambi(bns, pos, len, &ref_id);
printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
}
}
if (mem_only) putchar('\n');
putchar('\n');
}
if (!mem_only) {
int64_t pos;
int n_cigar, is_rev, ref_id, mapq;
if (hits.n == 0) continue;
mapq = cluster_hits(&hits, max_dist);
n_cigar = fake_cigar(bns, &hits, 0, hits.n, seq->seq.l, cigar, &pos, &is_rev);
bns_cnt_ambi(bns, pos, 1, &ref_id);
printf("%s\t%d\t%s\t%ld\t%d\t", seq->name.s, is_rev?16:0, bns->anns[ref_id].name, (long)(pos - bns->anns[ref_id].offset) + 1, mapq);
for (i = 0; i < n_cigar; ++i)
printf("%d%c", cigar[i]>>4, "MIDNSHP"[cigar[i]&0xf]);
printf("\t*\t0\t0\t*\t*\n");
} else puts("//");
puts("//");
}

free(mem.a);
Expand Down
3 changes: 2 additions & 1 deletion main.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include "utils.h"

#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.6.0-r64-dev"
#define PACKAGE_VERSION "0.6.0-r68-dev"
#endif

static int usage()
Expand All @@ -19,6 +19,7 @@ static int usage()
fprintf(stderr, " samse generate alignment (single ended)\n");
fprintf(stderr, " sampe generate alignment (paired ended)\n");
fprintf(stderr, " bwasw BWA-SW for long queries\n");
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
fprintf(stderr, "\n");
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
Expand Down

0 comments on commit 7babb54

Please sign in to comment.