forked from lh3/minimap2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathseed.c
106 lines (101 loc) · 3.42 KB
/
seed.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#include "mmpriv.h"
#include "kalloc.h"
#include "ksort.h"
mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv, int32_t *n_m_)
{
mm_seed_t *m;
size_t i;
int32_t k;
m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t));
for (i = k = 0; i < mv->n; ++i) {
const uint64_t *cr;
mm_seed_t *q;
mm128_t *p = &mv->a[i];
uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
int t;
cr = mm_idx_get(mi, p->x>>8, &t);
if (t == 0) continue;
q = &m[k++];
q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
q->is_tandem = q->flt = 0;
if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
}
*n_m_ = k;
return m;
}
#define MAX_MAX_HIGH_OCC 128
void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_occ, int dist)
{ // for high-occ minimizers, choose up to max_high_occ in each high-occ streak
extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*);
extern void ks_heapmake_uint64_t(size_t n, uint64_t*);
int32_t i, last0, m;
uint64_t b[MAX_MAX_HIGH_OCC]; // this is to avoid a heap allocation
if (n == 0 || n == 1) return;
for (i = m = 0; i < n; ++i)
if (a[i].n > max_occ) ++m;
if (m == 0) return; // no high-frequency k-mers; do nothing
for (i = 0, last0 = -1; i <= n; ++i) {
if (i == n || a[i].n <= max_occ) {
if (i - last0 > 1) {
int32_t ps = last0 < 0? 0 : (uint32_t)a[last0].q_pos>>1;
int32_t pe = i == n? len : (uint32_t)a[i].q_pos>>1;
int32_t j, k, st = last0 + 1, en = i;
int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499);
if (max_high_occ > 0) {
if (max_high_occ > MAX_MAX_HIGH_OCC)
max_high_occ = MAX_MAX_HIGH_OCC;
for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k)
b[k] = (uint64_t)a[j].n<<32 | j;
ks_heapmake_uint64_t(k, b); // initialize the binomial heap
for (; j < en; ++j) { // if there are more, choose top max_high_occ
if (a[j].n < (int32_t)(b[0]>>32)) { // then update the heap
b[0] = (uint64_t)a[j].n<<32 | j;
ks_heapdown_uint64_t(0, k, b);
}
}
for (j = 0; j < k; ++j) a[(uint32_t)b[j]].flt = 1;
}
for (j = st; j < en; ++j) a[j].flt ^= 1;
for (j = st; j < en; ++j)
if (a[j].n > max_max_occ)
a[j].flt = 1;
}
last0 = i;
}
}
}
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos)
{
int rep_st = 0, rep_en = 0, n_m, n_m0;
size_t i;
mm_seed_t *m;
*n_mini_pos = 0;
*mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
m = mm_seed_collect_all(km, mi, mv, &n_m0);
if (dist > 0 && max_max_occ > max_occ) {
mm_seed_select(n_m0, m, qlen, max_occ, max_max_occ, dist);
} else {
for (i = 0; i < n_m0; ++i)
if (m[i].n > max_occ)
m[i].flt = 1;
}
for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < n_m0; ++i) {
mm_seed_t *q = &m[i];
//fprintf(stderr, "X\t%d\t%d\t%d\n", q->q_pos>>1, q->n, q->flt);
if (q->flt) {
int en = (q->q_pos >> 1) + 1, st = en - q->q_span;
if (st > rep_en) {
*rep_len += rep_en - rep_st;
rep_st = st, rep_en = en;
} else rep_en = en;
} else {
*n_a += q->n;
(*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span<<32 | q->q_pos>>1;
m[n_m++] = *q;
}
}
*rep_len += rep_en - rep_st;
*_n_m = n_m;
return m;
}