-
Notifications
You must be signed in to change notification settings - Fork 0
/
bwamem_extra.c
165 lines (151 loc) · 5.29 KB
/
bwamem_extra.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#include <limits.h>
#include "bwa.h"
#include "bwamem.h"
#include "bntseq.h"
#include "kstring.h"
/***************************
* SMEM iterator interface *
***************************/
struct __smem_i {
const bwt_t *bwt;
const uint8_t *query;
int start, len;
int min_intv, max_len;
uint64_t max_intv;
bwtintv_v *matches; // matches; to be returned by smem_next()
bwtintv_v *sub; // sub-matches inside the longest match; temporary
bwtintv_v *tmpvec[2]; // temporary arrays
};
smem_i *smem_itr_init(const bwt_t *bwt)
{
smem_i *itr;
itr = calloc(1, sizeof(smem_i));
itr->bwt = bwt;
itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v));
itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
itr->matches = calloc(1, sizeof(bwtintv_v));
itr->sub = calloc(1, sizeof(bwtintv_v));
itr->min_intv = 1;
itr->max_len = INT_MAX;
itr->max_intv = 0;
return itr;
}
void smem_itr_destroy(smem_i *itr)
{
free(itr->tmpvec[0]->a); free(itr->tmpvec[0]);
free(itr->tmpvec[1]->a); free(itr->tmpvec[1]);
free(itr->matches->a); free(itr->matches);
free(itr->sub->a); free(itr->sub);
free(itr);
}
void smem_set_query(smem_i *itr, int len, const uint8_t *query)
{
itr->query = query;
itr->start = 0;
itr->len = len;
}
void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv)
{
itr->min_intv = min_intv;
itr->max_len = max_len;
itr->max_intv = max_intv;
}
const bwtintv_v *smem_next(smem_i *itr)
{
int ori_start;
itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0;
if (itr->start >= itr->len || itr->start < 0) return 0;
while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
if (itr->start == itr->len) return 0;
ori_start = itr->start;
itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM
return itr->matches;
}
/***********************
*** Extra functions ***
***********************/
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_)
{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence
extern mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf);
extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
mem_alnreg_v ar;
char *seq;
seq = malloc(l_seq);
memcpy(seq, seq_, l_seq); // makes a copy of seq_
ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq, 0);
mem_mark_primary_se(opt, ar.n, ar.a, lrand48());
free(seq);
return ar;
}
void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a)
{
int i;
kstring_t str = {0,0,0};
for (i = 0; i < a->n; ++i) {
const mem_alnreg_t *p = &a->a[i];
int is_rev, rid, qb = p->qb, qe = p->qe;
int64_t pos, rb = p->rb, re = p->re;
pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
rid = bns_pos2rid(bns, pos);
assert(rid == p->rid);
pos -= bns->anns[rid].offset;
kputs(s->name, &str); kputc('\t', &str);
kputw(s->l_seq, &str); kputc('\t', &str);
if (is_rev) qb ^= qe, qe ^= qb, qb ^= qe; // swap
kputw(qb, &str); kputc('\t', &str); kputw(qe, &str); kputc('\t', &str);
kputs(bns->anns[rid].name, &str); kputc('\t', &str);
kputw(bns->anns[rid].len, &str); kputc('\t', &str);
kputw(pos, &str); kputc('\t', &str); kputw(pos + (re - rb), &str); kputc('\t', &str);
ksprintf(&str, "%.3f", (double)p->truesc / opt->a / (qe - qb > re - rb? qe - qb : re - rb));
kputc('\n', &str);
}
s->sam = str.s;
}
static inline int get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i)
{
int k = a[i].secondary_all;
if (k >= 0 && a[i].score >= a[k].score * XA_drop_ratio) return k;
return -1;
}
// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se()
{
int i, k, r, *cnt, tot;
kstring_t *aln = 0, str = {0,0,0};
char **XA = 0, *has_alt;
cnt = calloc(a->n, sizeof(int));
has_alt = calloc(a->n, 1);
for (i = 0, tot = 0; i < a->n; ++i) {
r = get_pri_idx(opt->XA_drop_ratio, a->a, i);
if (r >= 0) {
++cnt[r], ++tot;
if (a->a[i].is_alt) has_alt[r] = 1;
}
}
if (tot == 0) goto end_gen_alt;
aln = calloc(a->n, sizeof(kstring_t));
for (i = 0; i < a->n; ++i) {
mem_aln_t t;
if ((r = get_pri_idx(opt->XA_drop_ratio, a->a, i)) < 0) continue;
if (cnt[r] > opt->max_XA_hits_alt || (!has_alt[r] && cnt[r] > opt->max_XA_hits)) continue;
t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
str.l = 0;
kputs(bns->anns[t.rid].name, &str);
kputc(',', &str); kputc("+-"[t.is_rev], &str); kputl(t.pos + 1, &str);
kputc(',', &str);
for (k = 0; k < t.n_cigar; ++k) {
kputw(t.cigar[k]>>4, &str);
kputc("MIDSHN"[t.cigar[k]&0xf], &str);
}
kputc(',', &str); kputw(t.NM, &str);
kputc(';', &str);
free(t.cigar);
kputsn(str.s, str.l, &aln[r]);
}
XA = calloc(a->n, sizeof(char*));
for (k = 0; k < a->n; ++k)
XA[k] = aln[k].s;
end_gen_alt:
free(has_alt); free(cnt); free(aln); free(str.s);
return XA;
}