-
Notifications
You must be signed in to change notification settings - Fork 0
/
fastmap.c
127 lines (117 loc) · 3.35 KB
/
fastmap.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#include <zlib.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include "bntseq.h"
#include "bwt.h"
#include "kvec.h"
#include "kseq.h"
KSEQ_INIT(gzFile, gzread)
extern unsigned char nst_nt4_table[256];
typedef struct {
const bwt_t *bwt;
const uint8_t *query;
int start, len;
bwtintv_v *tmpvec[2], *matches;
} smem_i;
smem_i *smem_iter_init(const bwt_t *bwt)
{
smem_i *iter;
iter = calloc(1, sizeof(smem_i));
iter->bwt = bwt;
iter->tmpvec[0] = calloc(1, sizeof(bwtintv_v));
iter->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
iter->matches = calloc(1, sizeof(bwtintv_v));
return iter;
}
void smem_iter_destroy(smem_i *iter)
{
free(iter->tmpvec[0]->a);
free(iter->tmpvec[1]->a);
free(iter->matches->a);
free(iter);
}
void smem_set_query(smem_i *iter, int len, const uint8_t *query)
{
iter->query = query;
iter->start = 0;
iter->len = len;
}
int smem_next(smem_i *iter)
{
iter->tmpvec[0]->n = iter->tmpvec[1]->n = iter->matches->n = 0;
if (iter->start >= iter->len || iter->start < 0) return -1;
while (iter->start < iter->len && iter->query[iter->start] > 3) ++iter->start; // skip ambiguous bases
if (iter->start == iter->len) return -1;
iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->matches, iter->tmpvec);
return iter->start;
}
int main_fastmap(int argc, char *argv[])
{
int c, i, min_iwidth = 20, min_len = 17, print_seq = 0;
kseq_t *seq;
bwtint_t k;
gzFile fp;
bwt_t *bwt;
bntseq_t *bns;
smem_i *iter;
while ((c = getopt(argc, argv, "w:l:s")) >= 0) {
switch (c) {
case 's': print_seq = 1; break;
case 'w': min_iwidth = atoi(optarg); break;
case 'l': min_len = atoi(optarg); break;
}
}
if (optind + 1 >= argc) {
fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", min_len, min_iwidth);
return 1;
}
fp = gzopen(argv[optind + 1], "r");
seq = kseq_init(fp);
{ // load the packed sequences, BWT and SA
char *tmp = calloc(strlen(argv[optind]) + 5, 1);
strcat(strcpy(tmp, argv[optind]), ".bwt");
bwt = bwt_restore_bwt(tmp);
strcat(strcpy(tmp, argv[optind]), ".sa");
bwt_restore_sa(tmp, bwt);
free(tmp);
bns = bns_restore(argv[optind]);
}
iter = smem_iter_init(bwt);
while (kseq_read(seq) >= 0) {
printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
if (print_seq) {
putchar('\t');
puts(seq->seq.s);
} else putchar('\n');
for (i = 0; i < seq->seq.l; ++i)
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
smem_set_query(iter, seq->seq.l, (uint8_t*)seq->seq.s);
while (smem_next(iter) > 0) {
for (i = 0; i < iter->matches->n; ++i) {
bwtintv_t *p = &iter->matches->a[i];
if ((uint32_t)p->info - (p->info>>32) < min_len) continue;
printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
if (p->x[2] <= min_iwidth) {
for (k = 0; k < p->x[2]; ++k) {
bwtint_t pos;
int len, is_rev, ref_id;
len = (uint32_t)p->info - (p->info>>32);
pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev);
if (is_rev) pos -= len - 1;
bns_cnt_ambi(bns, pos, len, &ref_id);
printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
}
} else fputs("\t*", stdout);
putchar('\n');
}
}
puts("//");
}
smem_iter_destroy(iter);
bns_destroy(bns);
bwt_destroy(bwt);
kseq_destroy(seq);
gzclose(fp);
return 0;
}