From 59eaf650ac86620b03539264591e2681d4b55ad4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Feb 2013 10:59:38 -0500 Subject: [PATCH] code backup --- Makefile | 2 +- bntseq.h | 1 + bwamem.c | 20 +++++++++++++++----- bwamem.h | 10 ++++++++++ bwamem_pair.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ fastmap.c | 6 +++--- 6 files changed, 74 insertions(+), 9 deletions(-) create mode 100644 bwamem_pair.c diff --git a/Makefile b/Makefile index 46e0b80b..c97fbcf2 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o stdaln.o \ +LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \ bseq.o bwaseqio.o bwase.o kstring.o AOBJS= QSufSort.o bwt_gen.o \ is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \ diff --git a/bntseq.h b/bntseq.h index d4096b44..0425540f 100644 --- a/bntseq.h +++ b/bntseq.h @@ -29,6 +29,7 @@ #define BWT_BNTSEQ_H #include +#include #include #ifndef BWA_UBYTE diff --git a/bwamem.c b/bwamem.c index 7dd55b4e..b17d23b5 100644 --- a/bwamem.c +++ b/bwamem.c @@ -14,7 +14,7 @@ #define MAPQ_COEF 40. -int mem_debug = 0; +int mem_verbose = 3; // 1: error only; 2: error+warning; 3: message+error+warning; >=4: debugging void mem_fill_scmat(int a, int b, int8_t mat[25]) { @@ -36,6 +36,7 @@ mem_opt_t *mem_opt_init() o->split_width = 10; o->max_occ = 10000; o->max_chain_gap = 10000; + o->max_ins = 10000; o->mask_level = 0.50; o->chain_drop_ratio = 0.50; o->chunk_size = 10000000; @@ -427,7 +428,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle); a->qe = qe + qle; a->re = rmax[0] + re + tle; } else a->qe = l_query, a->re = s->rbeg + s->len; - if (mem_debug >= 2) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); + if (mem_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); // check how many seeds have been covered for (i = k + 1; i < c->n; ++i) { const mem_seed_t *t = &c->seeds[i]; @@ -574,7 +575,7 @@ static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bn s->seq[i] = nst_nt4_table[(int)s->seq[i]]; chn = mem_chain(opt, bwt, s->l_seq, (uint8_t*)s->seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); - if (mem_debug >= 1) mem_print_chain(bns, &chn); + if (mem_verbose >= 4) mem_print_chain(bns, &chn); regs.n = regs.m = chn.n; regs.a = malloc(regs.n * sizeof(mem_alnreg_t)); for (i = 0; i < chn.n; ++i) { @@ -593,6 +594,7 @@ typedef struct { const uint8_t *pac; bseq1_t *seqs; mem_alnreg_v *regs; + mem_pestat_t *pes; } worker_t; static void *worker1(void *data) @@ -628,6 +630,8 @@ int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns int i; worker_t *w; mem_alnreg_v *regs; + mem_pestat_t pes[4]; + w = calloc(opt->n_threads, sizeof(worker_t)); regs = malloc(n * sizeof(mem_alnreg_v)); for (i = 0; i < opt->n_threads; ++i) { @@ -635,21 +639,27 @@ int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns p->start = i; p->step = opt->n_threads; p->n = n; p->opt = opt; p->bwt = bwt; p->bns = bns; p->pac = pac; p->seqs = seqs; p->regs = regs; + p->pes = &pes[0]; } #ifdef HAVE_PTHREAD if (opt->n_threads == 1) { - worker1(w); worker2(w); + worker1(w); + mem_pestat(opt, bns->l_pac, n, regs, pes); + worker2(w); } else { pthread_t *tid; tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); + mem_pestat(opt, bns->l_pac, n, regs, pes); for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker2, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); free(tid); } #else - worker1(w); worker2(w); + worker1(w); + mem_pestat(opt, bns->l_pac, n, regs, pes); + worker2(w); #endif for (i = 0; i < n; ++i) { fputs(seqs[i].sam, stdout); diff --git a/bwamem.h b/bwamem.h index c26893ae..c89abf6a 100644 --- a/bwamem.h +++ b/bwamem.h @@ -21,6 +21,7 @@ typedef struct { int n_threads, chunk_size; int pe_dir, is_pe; float mask_level, chain_drop_ratio; + int max_ins; // maximum insert size int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset } mem_opt_t; @@ -36,9 +37,16 @@ typedef struct { int sub_n; } mem_alnreg_t; +typedef struct { + int low, high, failed; + double avg, std; +} mem_pestat_t; + typedef kvec_t(mem_chain_t) mem_chain_v; typedef kvec_t(mem_alnreg_t) mem_alnreg_v; +extern int mem_verbose; + #ifdef __cplusplus extern "C" { #endif @@ -58,6 +66,8 @@ uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs); +void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]); + #ifdef __cplusplus } #endif diff --git a/bwamem_pair.c b/bwamem_pair.c new file mode 100644 index 00000000..2a0079b7 --- /dev/null +++ b/bwamem_pair.c @@ -0,0 +1,44 @@ +#include +#include "kstring.h" +#include "bwamem.h" +#include "kvec.h" + +#define MIN_RATIO 0.8 + +static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) +{ + int j; + for (j = 1; j < r->n; ++j) { // choose unique alignment + int b_max = r->a[j].qb > r->a[0].qb? r->a[j].qb : r->a[0].qb; + int e_min = r->a[j].qe < r->a[0].qe? r->a[j].qe : r->a[0].qe; + if (e_min > b_max) { // have overlap + int min_l = r->a[j].qe - r->a[j].qb < r->a[0].qe - r->a[0].qb? r->a[j].qe - r->a[j].qb : r->a[0].qe - r->a[0].qb; + if (e_min - b_max >= min_l * opt->mask_level) break; // significant overlap + } + } + return j < r->n? r->a[j].score : opt->min_seed_len * opt->a; +} + +void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) +{ + int i; + kvec_t(int) isize[4]; + memset(isize, 0, sizeof(kvec_t(int)) * 4); + for (i = 0; i < n>>1; i += 2) { + int dir; + int64_t is, pos[2]; + mem_alnreg_v *r[2]; + r[0] = (mem_alnreg_v*)®s[i<<1|0]; + r[1] = (mem_alnreg_v*)®s[i<<1|1]; + if (r[0]->n == 0 || r[1]->n == 0) continue; + if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue; + if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue; + pos[0] = r[0]->a[0].rb < l_pac? r[0]->a[0].rb : (l_pac<<1) - 1 - r[0]->a[0].rb; // forward coordinate + pos[1] = r[1]->a[0].rb < l_pac? r[1]->a[0].rb : (l_pac<<1) - 1 - r[1]->a[0].rb; + if (pos[0] < pos[1]) dir = (r[0]->a[0].rb >= l_pac)<<1 | (r[1]->a[0].rb >= l_pac); + else dir = (r[1]->a[0].rb >= l_pac)<<1 | (r[0]->a[0].rb >= l_pac); + is = abs(pos[0] - pos[1]); + if (is <= opt->max_ins) kv_push(int, isize[dir], is); + } + if (mem_verbose >= 3) fprintf(stderr, "[M::%s] # candidates unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); +} diff --git a/fastmap.c b/fastmap.c index 5093897a..698b3e12 100644 --- a/fastmap.c +++ b/fastmap.c @@ -11,7 +11,6 @@ KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; -extern int mem_debug; int main_mem(int argc, char *argv[]) { @@ -25,10 +24,10 @@ int main_mem(int argc, char *argv[]) bseq1_t *seqs; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "k:c:D:s:")) >= 0) { + while ((c = getopt(argc, argv, "k:c:v:s:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'c') opt->max_occ = atoi(optarg); - else if (c == 'D') mem_debug = atoi(optarg); + else if (c == 'v') mem_verbose = atoi(optarg); else if (c == 's') opt->split_width = atoi(optarg); } if (optind + 1 >= argc) { @@ -37,6 +36,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, "Options: -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); + fprintf(stderr, " -v INT verbose level [%d]\n", mem_verbose); fprintf(stderr, "\n"); free(opt); return 1;