Skip to content

Commit

Permalink
PE seems working; more testing needed
Browse files Browse the repository at this point in the history
  • Loading branch information
Heng Li committed Oct 20, 2011
1 parent 098f44c commit ec307a1
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 62 deletions.
1 change: 1 addition & 0 deletions bntseq.c
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *r
} else right = mid;
}
*real_seq = mid;
if (len == 0) return 0;
// binary search for holes
left = 0; right = bns->n_holes; nn = 0;
while (left < right) {
Expand Down
57 changes: 29 additions & 28 deletions bwape.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "bntseq.h"
#include "utils.h"
#include "stdaln.h"
#include "bwase.h"

typedef struct {
int n;
Expand Down Expand Up @@ -37,10 +38,8 @@ typedef struct {
extern int g_log_n[256]; // in bwase.c
static kh_64_t *g_hash;

void bwase_initialize();
void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi);
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
int bwa_approx_mapQ(const bwa_seq_t *p, int mm);
void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2);
bntseq_t *bwa_open_nt(const char *prefix);
Expand Down Expand Up @@ -277,25 +276,23 @@ typedef struct {
kvec_t(bwt_aln1_t) aln;
} aln_buf_t;

int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii,
int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bwt, int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii,
const pe_opt_t *opt, const gap_opt_t *gopt, const isize_info_t *last_ii)
{
int i, j, cnt_chg = 0;
char str[1024];
bwt_t *bwt[2];
bwt_t *bwt;
pe_data_t *d;
aln_buf_t *buf[2];

d = (pe_data_t*)calloc(1, sizeof(pe_data_t));
buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));

if (_bwt[0] == 0) { // load forward SA
strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]);
strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]);
} else bwt[0] = _bwt[0], bwt[1] = _bwt[1];
if (_bwt == 0) { // load forward SA
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
} else bwt = _bwt;

// SE
for (i = 0; i != n_seqs; ++i) {
Expand All @@ -314,16 +311,17 @@ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa
// generate SE alignment and mapping quality
bwa_aln2seq(n_aln, d->aln[j].a, p[j]);
if (p[j]->type == BWA_TYPE_UNIQUE || p[j]->type == BWA_TYPE_REPEAT) {
int strand;
int max_diff = gopt->fnr > 0.0? bwa_cal_maxdiff(p[j]->len, BWA_AVG_ERR, gopt->fnr) : gopt->max_diff;
p[j]->pos = p[j]->strand? bwt_sa(bwt[0], p[j]->sa)
: bwt[1]->seq_len - (bwt_sa(bwt[1], p[j]->sa) + p[j]->len);
p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff);
p[j]->pos = bwa_sa2pos(bns, bwt, p[j]->sa, p[j]->len, &strand);
p[j]->strand = strand;
}
}
}

// infer isize
infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt[0]->seq_len);
infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt->seq_len/2);
if (ii->avg < 0.0 && last_ii->avg > 0.0) *ii = *last_ii;
if (opt->force_isize) {
fprintf(stderr, "[%s] discard insert size estimate as user's request.\n", __func__);
Expand Down Expand Up @@ -361,8 +359,11 @@ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa
poslist_t *z = &kh_val(g_hash, iter);
z->n = r->l - r->k + 1;
z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n);
for (l = r->k; l <= r->l; ++l)
z->a[l - r->k] = r->a? bwt_sa(bwt[0], l) : bwt[1]->seq_len - (bwt_sa(bwt[1], l) + p[j]->len);
for (l = r->k; l <= r->l; ++l) {
int strand;
z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand);
r->a = strand;
}
}
for (l = 0; l < kh_val(g_hash, iter).n; ++l) {
x = kh_val(g_hash, iter).a[l];
Expand All @@ -371,7 +372,9 @@ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa
}
} else { // then calculate on the fly
for (l = r->k; l <= r->l; ++l) {
x = r->a? bwt_sa(bwt[0], l) : bwt[1]->seq_len - (bwt_sa(bwt[1], l) + p[j]->len);
int strand;
x = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand);
r->a = strand;
x = x<<32 | k<<1 | j;
kv_push(uint64_t, d->arr, x);
}
Expand All @@ -389,8 +392,10 @@ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa
bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? opt->n_multi : opt->N_multi);
} else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi);
for (k = 0; k < p[j]->n_multi; ++k) {
int strand;
bwt_multi1_t *q = p[j]->multi + k;
q->pos = q->strand? bwt_sa(bwt[0], q->pos) : bwt[1]->seq_len - (bwt_sa(bwt[1], q->pos) + p[j]->len);
q->pos = bwa_sa2pos(bns, bwt, q->pos, p[j]->len, &strand);
q->strand = strand;
}
}
}
Expand All @@ -403,9 +408,7 @@ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa
kv_destroy(buf[1][i].aln);
}
free(buf[0]); free(buf[1]);
if (_bwt[0] == 0) {
bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
}
if (_bwt == 0) bwt_destroy(bwt);
kv_destroy(d->arr);
kv_destroy(d->pos[0]); kv_destroy(d->pos[1]);
kv_destroy(d->aln[0]); kv_destroy(d->aln[1]);
Expand Down Expand Up @@ -655,12 +658,12 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
khint_t iter;
isize_info_t last_ii; // this is for the last batch of reads
char str[1024];
bwt_t *bwt[2];
bwt_t *bwt;
uint8_t *pac;

// initialization
bwase_initialize(); // initialize g_log_n[] in bwase.c
pac = 0; bwt[0] = bwt[1] = 0;
pac = 0; bwt = 0;
for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
bns = bns_restore(prefix);
srand48(bns->seed);
Expand All @@ -679,10 +682,8 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
ntbns = bwa_open_nt(prefix);
} else { // for Illumina alignment only
if (popt->is_preload) {
strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]);
strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]);
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
rewind(bns->fp_pac);
fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
Expand All @@ -702,7 +703,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
t = clock();

fprintf(stderr, "[bwa_sai2sam_pe_core] convert to sequence coordinate... \n");
cnt_chg = bwa_cal_pac_pos_pe(prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii);
cnt_chg = bwa_cal_pac_pos_pe(bns, prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii);
fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
fprintf(stderr, "[bwa_sai2sam_pe_core] changing coordinates of %d alignments.\n", cnt_chg);

Expand Down Expand Up @@ -746,7 +747,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a);
kh_destroy(64, g_hash);
if (pac) {
free(pac); bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
free(pac); bwt_destroy(bwt);
}
}

Expand Down
63 changes: 30 additions & 33 deletions bwase.c
Original file line number Diff line number Diff line change
Expand Up @@ -109,54 +109,52 @@ int bwa_approx_mapQ(const bwa_seq_t *p, int mm)
return (23 < g_log_n[n])? 0 : 23 - g_log_n[n];
}

bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand)
{
bwtint_t pacpos;
int32_t ref_id;
pacpos = bwt_sa(bwt, sapos);
bns_coor_pac2real(bns, pacpos, 0, &ref_id);
*strand = !(ref_id&1);
/* NB: For gapped alignment, pacpos may not be correct, which will be fixed
* in refine_gapped_core(). This line also determines the way "x" is
* calculated in refine_gapped_core() when (ext < 0 && is_end == 0). */
if (ref_id&1) // mapped to the forward strand
pacpos = bns->anns[ref_id].len - (pacpos + len - bns->anns[ref_id].offset) + bns->anns[ref_id-1].offset;
return pacpos;
}

/**
* Derive the actual position in the read from the given suffix array
* coordinates. Note that the position will be approximate based on
* whether indels appear in the read and whether calculations are
* performed from the start or end of the read.
*/
void bwa_cal_pac_pos_core(const bwt_t *forward_bwt, const bwt_t *reverse_bwt, bwa_seq_t *seq, const int max_mm, const float fnr)
void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t *bwt, bwa_seq_t *seq, const int max_mm, const float fnr)
{
int max_diff;
int max_diff, strand;
if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return;
max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm;
if (seq->strand) { // reverse strand only
seq->pos = bwt_sa(forward_bwt, seq->sa);
seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
} else { // forward strand only
/* NB: For gapped alignment, p->pos may not be correct, which
* will be fixed in refine_gapped_core(). This line also
* determines the way "x" is calculated in
* refine_gapped_core() when (ext < 0 && is_end == 0). */
seq->pos = reverse_bwt->seq_len - (bwt_sa(reverse_bwt, seq->sa) + seq->len);
seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
}
seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
seq->pos = bwa_sa2pos(bns, bwt, seq->sa, seq->len, &strand);
seq->strand = strand;
seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
}

void bwa_cal_pac_pos(const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr)
void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr)
{
int i, j;
int i, j, strand;
char str[1024];
bwt_t *bwt;
// load forward SA
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
for (i = 0; i != n_seqs; ++i) {
if (seqs[i].strand) bwa_cal_pac_pos_core(bwt, 0, &seqs[i], max_mm, fnr);
for (j = 0; j < seqs[i].n_multi; ++j) {
bwt_multi1_t *p = seqs[i].multi + j;
if (p->strand) p->pos = bwt_sa(bwt, p->pos);
}
}
bwt_destroy(bwt);
// load reverse BWT and SA
strcpy(str, prefix); strcat(str, ".rbwt"); bwt = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt);
for (i = 0; i != n_seqs; ++i) {
if (!seqs[i].strand) bwa_cal_pac_pos_core(0, bwt, &seqs[i], max_mm, fnr);
bwa_cal_pac_pos_core(bns, bwt, &seqs[i], max_mm, fnr);
for (j = 0; j < seqs[i].n_multi; ++j) {
bwt_multi1_t *p = seqs[i].multi + j;
if (!p->strand) p->pos = bwt->seq_len - (bwt_sa(bwt, p->pos) + seqs[i].len);
p->pos = bwa_sa2pos(bns, bwt, p->pos, seqs[i].len, &strand);
p->strand = strand;
}
}
bwt_destroy(bwt);
Expand All @@ -174,7 +172,7 @@ static bwa_cigar_t *refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, in
int l = 0, path_len, ref_len;
AlnParam ap = aln_param_bwa;
path_t *path;
int64_t k, __pos = *_pos > l_pac? (int64_t)((int32_t)*_pos) : *_pos;
int64_t k, __pos = *_pos;

ref_len = len + abs(ext);
if (ext > 0) {
Expand All @@ -192,7 +190,7 @@ static bwa_cigar_t *refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, in
aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len);
cigar = bwa_aln_path2cigar(path, path_len, n_cigar);

if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped on the forward strand
if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped to the forward strand
for (l = k = 0; k < *n_cigar; ++k) {
if (__cigar_op(cigar[k]) == FROM_D) l -= __cigar_len(cigar[k]);
else if (__cigar_op(cigar[k]) == FROM_I) l += __cigar_len(cigar[k]);
Expand Down Expand Up @@ -238,8 +236,7 @@ char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_
} else ++u;
}
x += l; y += l;
/* } else if (cigar[k]>>14 == FROM_I || cigar[k]>>14 == 3) { */
} else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) {
} else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) {
y += l;
if (__cigar_op(cigar[k]) == FROM_I) nm += l;
} else if (__cigar_op(cigar[k]) == FROM_D) {
Expand Down Expand Up @@ -631,7 +628,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
}

fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... ");
bwa_cal_pac_pos(prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here
bwa_cal_pac_pos(bns, prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();

fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
Expand Down
4 changes: 3 additions & 1 deletion bwase.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,15 @@ extern "C" {
// Initialize mapping tables in the bwa single-end mapper.
void bwase_initialize();
// Calculate the approximate position of the sequence from the specified bwt with loaded suffix array.
void bwa_cal_pac_pos_core(const bwt_t* forward_bwt, const bwt_t* reverse_bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
// Refine the approximate position of the sequence to an actual placement for the sequence.
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
// Backfill certain alignment properties mainly centering around number of matches.
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
// Calculate the end position of a read given a certain sequence.
int64_t pos_end(const bwa_seq_t *p);
//
bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand);

#ifdef __cplusplus
}
Expand Down

0 comments on commit ec307a1

Please sign in to comment.