Skip to content

Commit

Permalink
mate rescue seems working (not MT)
Browse files Browse the repository at this point in the history
  • Loading branch information
Heng Li committed Nov 6, 2011
1 parent 17eaac5 commit c8c79ef
Show file tree
Hide file tree
Showing 7 changed files with 617 additions and 32 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -O2
CXXFLAGS= $(CFLAGS)
DFLAGS= -DHAVE_PTHREAD #-D_FILE_OFFSET_BITS=64
OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \
is.o bntseq.o bwtmisc.o bwtindex.o stdaln.o simple_dp.o \
is.o bntseq.o bwtmisc.o bwtindex.o ksw.o stdaln.o simple_dp.o \
bwaseqio.o bwase.o bwape.o kstring.o cs2nt.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o bamlite.o fastmap.o bwtsw2_pair.o
Expand Down
4 changes: 3 additions & 1 deletion bwtsw2.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include "bwt_lite.h"
#include "bwt.h"

/* Hit flag bit marking an alignment that was produced by mate Smith-Waterman
 * rescue: bsw2_pair1() sets it and print_hits() tests it. print_hits() only
 * copies the low 8 bits of the hit flag into the SAM flag, so this bit is
 * internal and never reaches the SAM FLAG column. */
#define BSW2_FLAG_MATESW 0x100

typedef struct {
int a, b, q, r, t, qr, bw;
int z, is, t_seeds, hard_clip;
Expand Down Expand Up @@ -50,7 +52,7 @@ extern "C" {
bsw2global_t *bsw2_global_init();
void bsw2_global_destroy(bsw2global_t *_pool);

void bwtsw2_pair(const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hit);
void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hit);

#ifdef __cplusplus
}
Expand Down
64 changes: 39 additions & 25 deletions bwtsw2_aux.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,22 +66,14 @@ void bsw2_destroy(bwtsw2_t *b)
free(b);
}

bwtsw2_t *bsw2_dup(const bwtsw2_t *b)
bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b)
{
bwtsw2_t *p;
int i;
p = calloc(1, sizeof(bwtsw2_t));
p->max = p->n = b->n;
kroundup32(p->max);
p->hits = calloc(p->max, sizeof(bsw2hit_t));
p->n_cigar = calloc(p->max, sizeof(int));
p->cigar = calloc(p->max, sizeof(void*));
memcpy(p->hits, b->hits, p->n * sizeof(bsw2hit_t));
for (i = 0; i < p->n; ++i) {
p->n_cigar[i] = b->n_cigar[i];
p->cigar[i] = malloc(p->n_cigar[i] * 4);
memcpy(p->cigar[i], b->cigar[i], p->n_cigar[i] * 4);
}
return p;
}

Expand Down Expand Up @@ -406,9 +398,22 @@ static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n
return n_cigar;
}

/* Estimate a mapping quality in [0,250] for hit p.
 *
 * The gap between the best score p->G and the sub-optimal score (the larger
 * of p->G2 and the score threshold opt->t) is scaled by 250/G + 0.03/a.
 * The result is down-weighted by 0.5 when p->flag>>16 is 1 or 2, and by 0.2
 * when the hit is supported by fewer than two seeds. A hit with bit 0 of
 * p->flag set (a random hit) always gets quality 0.
 *
 * A non-positive p->G yields 0 immediately: with G == 0 the scale factor
 * 250.0/G is infinite and the float-to-int conversion below would be
 * undefined, and with G < 0 the formula would turn a negative score into a
 * positive quality.
 */
static int est_mapq(const bsw2hit_t *p, const bsw2opt_t *opt)
{
	float c = 1.0;
	int qual, subo;
	if (p->G <= 0) return 0; // no usable score: avoid the division by zero below
	subo = p->G2 > opt->t? p->G2 : opt->t; // sub-optimal score, floored at the threshold
	if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5;
	if (p->n_seeds < 2) c *= .2;
	qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499);
	if (qual > 250) qual = 250;
	if (qual < 0) qual = 0;
	if (p->flag&1) qual = 0; // this is a random hit
	return qual;
}

/* generate SAM lines for a sequence in ks with alignment stored in
* b. ks->name and ks->seq will be freed and set to NULL in the end. */
static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks, bwtsw2_t *b)
static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks, bwtsw2_t *b, int is_pe, bwtsw2_t *bmate)
{
int i, k;
kstring_t str;
Expand All @@ -433,18 +438,15 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks
nn = bns_cnt_ambi(bns, p->k, p->len, &seqid);
coor = p->k - bns->anns[seqid].offset;
}
ksprintf(&str, "%s\t%d", ks->name, p->flag&0x10);
ksprintf(&str, "%s\t%d", ks->name, (p->flag&0xff)|(is_pe?1:0));
ksprintf(&str, "\t%s\t%ld", seqid>=0? bns->anns[seqid].name : "*", (long)coor + 1);
if (p->l == 0) {
{ // estimate mapping quality
float c = 1.0;
int subo = p->G2 > opt->t? p->G2 : opt->t;
if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5;
if (p->n_seeds < 2) c *= .2;
qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499);
if (qual > 250) qual = 250;
if (qual < 0) qual = 0;
if (p->flag&1) qual = 0;
qual = est_mapq(p, opt);
if ((p->flag & BSW2_FLAG_MATESW) && bmate && bmate->n == 1) { // this alignment is from Smith-Waterman rescue
int mate_qual = est_mapq(bmate->hits, opt);
qual = qual < mate_qual? qual : mate_qual;
}
}
ksprintf(&str, "\t%d\t", qual);
for (k = 0; k < b->n_cigar[i]; ++k)
Expand All @@ -469,6 +471,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks
} else ksprintf(&str, "\t*");
ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tXN:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, nn);
if (p->l) ksprintf(&str, "\tXI:i:%d", p->l - p->k + 1);
if (p->flag&BSW2_FLAG_MATESW) ksprintf(&str, "\tXT:i:1");
kputc('\n', &str);
}
ks->sam = str.s;
Expand Down Expand Up @@ -526,7 +529,7 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const
rseq[1][i] = c;
}
if (l - k < opt.t) { // too few unambiguous bases
print_hits(bns, &opt, p, 0);
buf[x] = 0;
free(seq[0]); continue;
}
// alignment
Expand All @@ -548,17 +551,28 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const
bsw2_resolve_query_overlaps(b[0], opt.mask_level);
} else b[1] = 0;
// generate CIGAR and print SAM
gen_cigar(&opt, l, seq, pac, b[0]);
buf[x] = bsw2_dup(b[0]);
buf[x] = bsw2_dup_no_cigar(b[0]);
// free
free(seq[0]);
bsw2_destroy(b[0]);
}
bwtsw2_pair(pac, _seq->n, _seq->seq, buf);
if (is_pe) bsw2_pair(&opt, bns->l_pac, pac, _seq->n, _seq->seq, buf);
for (x = 0; x < _seq->n; ++x) {
print_hits(bns, &opt, &_seq->seq[x], buf[x]);
bsw2_destroy(buf[x]);
bsw2seq1_t *p = _seq->seq + x;
uint8_t *seq[2];
int i;
seq[0] = malloc(p->l * 2); seq[1] = seq[0] + p->l;
for (i = 0; i < p->l; ++i) {
int c = nst_nt4_table[(int)p->seq[i]];
if (c >= 4) c = (int)(drand48() * 4);
seq[0][i] = c;
seq[1][p->l-1-i] = 3 - c;
}
gen_cigar(&opt, p->l, seq, pac, buf[x]);
print_hits(bns, &opt, p, buf[x], is_pe, buf[x^1]);
free(seq[0]);
}
for (x = 0; x < _seq->n; ++x) bsw2_destroy(buf[x]);
free(buf);
bsw2_global_destroy(pool);
}
Expand Down
1 change: 1 addition & 0 deletions bwtsw2_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,7 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int
}
if (!compatible) {
p->G = 0;
if (q->G2 < p->G2) q->G2 = p->G2;
break;
}
}
Expand Down
125 changes: 120 additions & 5 deletions bwtsw2_pair.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "bwt.h"
#include "bntseq.h"
#include "bwtsw2.h"
Expand All @@ -10,13 +11,14 @@
#define MIN_RATIO 0.8
#define OUTLIER_BOUND 2.0
#define MAX_STDDEV 4.0
/* Half-width, in standard deviations of the estimated pair distance, of the
 * reference window that bsw2_pair1() searches around the expected mate
 * position (avg +/- EXT_STDDEV * std). */
#define EXT_STDDEV 4.0

/* Paired-end distance statistics, as returned by bsw2_stat(). */
typedef struct {
	int low;    // presumably the lower bound of the accepted distance range -- TODO confirm
	int high;   // presumably the upper bound of the accepted distance range -- TODO confirm
	double avg; // average distance; centre of the mate-rescue window in bsw2_pair1()
	double std; // standard deviation; scales the width of the mate-rescue window
} bsw2pestat_t;

bsw2pestat_t bwtsw2_stat(int n, bwtsw2_t **buf)
bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf)
{
extern void ks_introsort_uint64_t(size_t n, uint64_t *a);
int i, k, x, p25, p50, p75, tmp, max_len = 0;
Expand All @@ -27,11 +29,11 @@ bsw2pestat_t bwtsw2_stat(int n, bwtsw2_t **buf)
for (i = k = 0; i < n; i += 2) {
bsw2hit_t *t[2];
int l;
if (buf[i]->n != 1 || buf[i+1]->n != 1) continue; // more than 1 hits
if (buf[i] == 0 || buf[i]->n != 1 || buf[i+1]->n != 1) continue; // more than 1 hits
t[0] = &buf[i]->hits[0]; t[1] = &buf[i+1]->hits[0];
if (t[0]->G2 > 0.8 * t[0]->G) continue; // the best hit is not good enough
if (t[1]->G2 > 0.8 * t[1]->G) continue; // the best hit is not good enough
l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + (t[1]->end - t[1]->beg) : t[1]->k - t[0]->k + (t[0]->end - t[0]->beg);
l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + t[1]->len : t[1]->k - t[0]->k + t[0]->len;
max_len = max_len > t[0]->end - t[0]->beg? max_len : t[0]->end - t[0]->beg;
max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg;
isize[k++] = l;
Expand Down Expand Up @@ -64,8 +66,121 @@ bsw2pestat_t bwtsw2_stat(int n, bwtsw2_t **buf)
return r;
}

void bwtsw2_pair(const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hits)
/* Auxiliary per-alignment record for pairing.
 * NOTE(review): not referenced by the code visible here; the field meanings
 * below are inferred from their names and should be confirmed. */
typedef struct {
	int n_cigar;     // presumably the number of entries in cigar
	int beg;         // presumably the alignment start on the query
	int end;         // presumably the alignment end on the query
	int len;         // presumably the aligned length on the reference
	int64_t pos;     // presumably the reference position
	uint32_t *cigar; // presumably the CIGAR operations; ownership not shown here
} pairaux_t;

/* Character-to-base-code lookup table defined in another translation unit.
 * The code in this file treats codes 0-3 as the four bases and any code
 * above 3 as an ambiguous base. */
extern unsigned char nst_nt4_table[256];
/* 5x5 scoring matrix for Smith-Waterman: bsw2_pair() fills it from opt->a
 * (match) and opt->b (mismatch penalty), with 0 in the fifth column, and
 * bsw2_pair1() passes it to ksw_qinit().
 * NOTE(review): file-level mutable state that is rewritten on every
 * bsw2_pair() call. */
static int8_t g_mat[25];

/* Try to rescue the mate of an aligned read with Smith-Waterman.
 *
 * Given the hit h of one read and the pair-distance statistics st, a window
 * of the reference is derived from h->k, st->avg and EXT_STDDEV * st->std,
 * then clamped to [1, l_pac]. The mate sequence mseq (l_mseq characters) is
 * aligned to that window on the strand opposite to h.
 *
 *   opt    scoring options (t: score threshold, q/r: gap open/extension)
 *   l_pac  upper clamp for reference coordinates
 *   pac    2-bit packed reference
 *   a      output hit. n_seeds, l, is_rev and the MATESW/strand flag bits
 *          are always written. G, G2, k, len, beg and end are written only
 *          when an alignment with a non-zero score is found. The function
 *          never clears a->G, so the caller should zero *a beforehand to be
 *          able to tell "no alignment" apart.
 *
 * The alignment end is taken from a forward pass; the alignment start comes
 * from a second pass over the reversed query and reversed window.
 */
void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const bsw2pestat_t *st, const bsw2hit_t *h, int l_mseq, const char *mseq, bsw2hit_t *a)
{
	extern void seq_reverse(int len, ubyte_t *seq, int is_comp);
	int64_t k, beg, end;
	uint8_t *seq, *ref;
	int i;
	ksw_query_t *q;
	ksw_aux_t aux[2];
	// compute the region start and end
	a->n_seeds = 1; a->l = 0; a->flag |= BSW2_FLAG_MATESW;
	if (h->is_rev == 0) { // h is on the forward strand: the mate is expected downstream, reversed
		beg = (int64_t)(h->k + st->avg - EXT_STDDEV * st->std - l_mseq + .499);
		end = (int64_t)(h->k + st->avg + EXT_STDDEV * st->std + .499);
		a->is_rev = 1; a->flag |= 16;
	} else { // h is on the reverse strand: the mate is expected upstream, forward
		beg = (int64_t)(h->k + h->end - h->beg - st->avg - EXT_STDDEV * st->std + .499);
		end = (int64_t)(h->k + h->end - h->beg - st->avg + EXT_STDDEV * st->std + l_mseq + .499);
		a->is_rev = 0;
	}
	if (beg < 1) beg = 1;
	if (end > l_pac) end = l_pac;
	// Clamping can leave an empty or inverted window when the expected mate
	// position falls outside the reference. A negative (end - beg) would
	// under-allocate the buffer below, which is then written with l_mseq
	// bytes, and would hand a negative length to ksw_sse2().
	if (end <= beg) return;
	// generate the sequence
	seq = malloc(l_mseq + (end - beg)); // one buffer: query first, reference window after it
	ref = seq + l_mseq;
	for (k = beg; k < end; ++k) // unpack 2-bit bases, four per byte, most significant pair first
		ref[k - beg] = pac[k>>2] >> ((~k&3)<<1) & 0x3;
	if (h->is_rev == 0) {
		for (i = 0; i < l_mseq; ++i) { // on the reverse strand
			int c = nst_nt4_table[(int)mseq[i]];
			seq[l_mseq - 1 - i] = c > 3? 4 : 3 - c; // reverse-complement; ambiguous bases become 4
		}
	} else {
		for (i = 0; i < l_mseq; ++i) // on the forward strand
			seq[i] = nst_nt4_table[(int)mseq[i]];
	}
	/* The following code can be made up to 2-fold as fast. I am just lazy... */
	// forward Smith-Waterman
	aux[0].T = opt->t; aux[0].gapo = opt->q; aux[0].gape = opt->r; aux[1] = aux[0];
	q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat); // first argument is 1 if the maximum possible score is below 250, otherwise 2
	ksw_sse2(q, end - beg, ref, &aux[0]);
	free(q);
	if (aux[0].score == 0) { // nothing aligned: leave the score fields of *a untouched
		free(seq);
		return;
	}
	// reverse Smith-Waterman
	seq_reverse(l_mseq, seq, 0);
	seq_reverse(end - beg, ref, 0);
	q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat);
	ksw_sse2(q, end - beg, ref, &aux[1]);
	free(q);
	aux[1].te = end - beg - 1 - aux[1].te; // change to the forward-strand coordinate
	// write output
	a->G = aux[0].score;
	a->G2 = aux[0].score2 > aux[1].score2? aux[0].score2 : aux[1].score2;
	a->k = beg + aux[1].te;
	a->len = aux[0].te + 1 - aux[1].te;
	a->beg = l_mseq - 1 - aux[1].qe;
	a->end = aux[0].qe + 1;
	free(seq);
}

/* Mate-rescue pass over a batch of paired reads.
 *
 * seq[] and hits[] hold n entries laid out as consecutive pairs (2i, 2i+1).
 * A NULL hits[] entry marks a read that was not aligned at all.
 *
 * Steps:
 *   1. estimate the pair-distance statistics with bsw2_stat();
 *   2. fill the 5x5 scoring matrix g_mat from opt->a / opt->b;
 *   3. tag every existing hit with the read1 (1<<6) or read2 (1<<7) flag;
 *   4. for pairs where one end has exactly one hit and the other has none,
 *      store the alignment found by bsw2_pair1() as the single hit of the
 *      unmapped end.
 * Pairs where both ends have one hit are aligned by bsw2_pair1() too, but
 * the results are not used yet. The number of rescued reads goes to stderr.
 */
void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hits)
{
	bsw2pestat_t pes;
	int i, j, k, n_rescued = 0;
	pes = bsw2_stat(n, hits);
	for (i = k = 0; i < 5; ++i) { // match on the diagonal, mismatch elsewhere, 0 in the 5th column
		for (j = 0; j < 4; ++j)
			g_mat[k++] = i == j? opt->a : -opt->b;
		g_mat[k++] = 0;
	}
	for (i = 0; i < n; i += 2) {
		bsw2hit_t a[2]; // a[j] receives the rescued alignment of read j of the pair
		memset(&a, 0, sizeof(bsw2hit_t) * 2);
		a[0].flag = 1<<6; a[1].flag = 1<<7;
		for (j = 0; j < 2; ++j) { // set the read1/2 flag
			if (hits[i+j] == 0) continue;
			for (k = 0; k < hits[i+j]->n; ++k) {
				bsw2hit_t *p = &hits[i+j]->hits[k];
				p->flag |= 1<<(6+j);
			}
		}
		if (hits[i] == 0 || hits[i+1] == 0) continue; // one end has excessive N
		if (hits[i]->n != 1 && hits[i+1]->n != 1) continue; // no end has exactly one hit
		if (hits[i]->n > 1 || hits[i+1]->n > 1) continue; // one read has more than one hit
		if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1]);
		if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0]);
		// the following enumerate all possibilities. It is tedious but necessary...
		if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not
			bwtsw2_t *p[2]; // p[0]: the mapped end; p[1]: the end to rescue
			int which; // index into a[] of the end to rescue
			if (hits[i]->n == 1) p[0] = hits[i], p[1] = hits[i+1], which = 1;
			else p[0] = hits[i+1], p[1] = hits[i], which = 0;
			if (a[which].G == 0) continue; // rescue found no alignment
			if (p[1]->max == 0) {
				// hits may hold a zero-sized allocation (e.g. from calloc with
				// a zero count); release it instead of leaking it
				free(p[1]->hits);
				p[1]->max = 1;
				p[1]->hits = malloc(sizeof(bsw2hit_t));
			}
			memcpy(p[1]->hits, &a[which], sizeof(bsw2hit_t));
			p[1]->n = 1;
			++n_rescued;
		}
		// otherwise both ends are mapped: nothing is done with a[] yet
	}
	fprintf(stderr, "[%s] rescued %d reads\n", __func__, n_rescued);
}
Loading

0 comments on commit c8c79ef

Please sign in to comment.