Skip to content

Commit

Permalink
adding support for SAM/BAM output using htslib. Please turn on within
Browse files Browse the repository at this point in the history
the Makefile.
  • Loading branch information
Nils Homer committed Jul 28, 2014
1 parent 3efc331 commit 482a47a
Show file tree
Hide file tree
Showing 12 changed files with 264 additions and 25 deletions.
24 changes: 19 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,44 @@ CC= gcc
#CC= clang --analyze
CFLAGS= -g -Wall -Wno-unused-function -O2
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
# set to 1 if you wish to have bam support, type 'make clean; make all'
USE_HTSLIB=0
AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
is.o bwtindex.o bwape.o kopen.o pemerge.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
PROG= bwa
INCLUDES=
LIBS= -lm -lz -lpthread
LIBS= -lm -lz -lpthread
SUBDIRS= .

.SUFFIXES:.c .o .cc

.c.o:
$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
ifeq ($(USE_HTSLIB),1)
$(CC) -c $(CFLAGS) $(DFLAGS) -DUSE_HTSLIB $(INCLUDES) -I ../htslib $< -o $@
else
$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
endif

all:$(PROG)

bwa:libbwa.a $(AOBJS) main.o
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
ifeq ($(USE_HTSLIB),1)
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ ../htslib/libhts.a -L. -L../htslib -lbwa $(LIBS)
else
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
endif

bwamem-lite:libbwa.a example.o
$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS)
ifeq ($(USE_HTSLIB),1)
$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ ../htslib/libhts.a -L. -L../htslib -lbwa $(LIBS)
else
$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS)
endif

libbwa.a:$(LOBJS)
$(AR) -csru $@ $(LOBJS)
Expand Down
2 changes: 2 additions & 0 deletions bamlite.c
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#ifndef USE_HTSLIB
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
Expand Down Expand Up @@ -208,3 +209,4 @@ int bamlite_gzclose(gzFile file) {
return ret;
}
#endif /* USE_VERBOSE_ZLIB_WRAPPERS */
#endif
2 changes: 2 additions & 0 deletions bamlite.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#ifndef BAMLITE_H_
#define BAMLITE_H_
#ifndef USE_HTSLIB

#include <stdint.h>
#include <zlib.h>
Expand Down Expand Up @@ -112,3 +113,4 @@ extern "C" {
#endif

#endif
#endif
39 changes: 39 additions & 0 deletions bwa.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
#include "ksw.h"
#include "utils.h"
#include "kstring.h"
#ifdef USE_HTSLIB
#include <htslib/sam.h>
#endif

#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
Expand Down Expand Up @@ -274,6 +277,19 @@ void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line)
err_printf("%s\n", bwa_pg);
}

#ifdef USE_HTSLIB
void bwa_format_sam_hdr(const bntseq_t *bns, const char *rg_line, kstring_t *str)
{
int i;
extern char *bwa_pg;
str->l = 0; str->s = 0;
for (i = 0; i < bns->n_seqs; ++i)
ksprintf(str, "@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
if (rg_line) ksprintf(str, "%s\n", rg_line);
ksprintf(str, "%s\n", bwa_pg);
}
#endif

static char *bwa_escape(char *s)
{
char *p, *q;
Expand Down Expand Up @@ -319,3 +335,26 @@ char *bwa_set_rg(const char *s)
return 0;
}

#ifdef USE_HTSLIB
bams_t *bams_init() {
return calloc(1, sizeof(bams_t));
}

void bams_add(bams_t *bams, bam1_t *b) {
if (bams->m == bams->l) {
bams->m = bams->m << 2;
bams->bams = realloc(bams->bams, sizeof(bam1_t) * bams->m);
}
bams->bams[bams->l] = b;
bams->l++;
}

void bams_destroy(bams_t *bams) {
int i;
for (i = 0; i < bams->l; i++) {
bam_destroy1(bams->bams[i]);
}
free(bams->bams);
free(bams);
}
#endif
32 changes: 32 additions & 0 deletions bwa.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
#include <stdint.h>
#include "bntseq.h"
#include "bwt.h"
#ifdef USE_HTSLIB
#include <htslib/sam.h>
#endif

#define BWA_IDX_BWT 0x1
#define BWA_IDX_BNS 0x2
Expand All @@ -16,11 +19,31 @@ typedef struct {
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
} bwaidx_t;

#ifdef USE_HTSLIB
typedef struct {
int l, m;
bam1_t **bams;
} bams_t;
#endif

typedef struct {
int l_seq;
#ifdef USE_HTSLIB
char *name, *comment, *seq, *qual;
bams_t *bams;
#else
char *name, *comment, *seq, *qual, *sam;
#endif
} bseq1_t;

// This is here to faciliate passing around HTSLIB's bam_hdr_t structure when we are not compiling with HTSLIB
#ifndef USE_HTSLIB
typedef struct {
void *ptr;
} bam_hdr_t; // DO NOT USE
#endif


extern int bwa_verbose;
extern char bwa_rg_id[256];

Expand All @@ -41,8 +64,17 @@ extern "C" {
void bwa_idx_destroy(bwaidx_t *idx);

void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line);
#ifdef USE_HTSLIB
void bwa_format_sam_hdr(const bntseq_t *bns, const char *rg_line, kstring_t *str);
#endif
char *bwa_set_rg(const char *s);

#ifdef USE_HTSLIB
bams_t *bams_init();
void bams_add(bams_t *bams, bam1_t *b);
void bams_destroy(bams_t *bams);
#endif

#ifdef __cplusplus
}
#endif
Expand Down
42 changes: 33 additions & 9 deletions bwamem.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
#include "ksort.h"
#include "utils.h"

#ifdef USE_HTSLIB
#include <htslib/sam.h>
#endif

#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
Expand Down Expand Up @@ -805,7 +809,7 @@ static inline int get_rlen(int n_cigar, const uint32_t *cigar)
return l;
}

void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_, int softclip_all)
void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_, int softclip_all, bam_hdr_t *h)
{
int i, l_name;
mem_aln_t ptmp = list[which], *p = &ptmp, mtmp, *m = 0; // make a copy of the alignment to convert
Expand Down Expand Up @@ -922,6 +926,15 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); }
if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
kputc('\n', str);

#ifdef USE_HTSLIB
bam1_t *b = bam_init1();
if (sam_parse1(str, h, b) < 0) {
// TODO: error!
}
bams_add(s->bams, b);
str->l = 0; str->s = 0;
#endif
}

/************************
Expand Down Expand Up @@ -955,7 +968,7 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
}

// TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible
void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m)
void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m, bam_hdr_t *h)
{
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query);
kstring_t str;
Expand Down Expand Up @@ -987,14 +1000,16 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa
mem_aln_t t;
t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0);
t.flag |= extra_flag;
mem_aln2sam(bns, &str, s, 1, &t, 0, m, opt->flag&MEM_F_SOFTCLIP);
mem_aln2sam(bns, &str, s, 1, &t, 0, m, opt->flag&MEM_F_SOFTCLIP, h);
} else {
for (k = 0; k < aa.n; ++k)
mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m, opt->flag&MEM_F_SOFTCLIP);
mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m, opt->flag&MEM_F_SOFTCLIP, h);
for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar);
free(aa.a);
}
#ifndef USE_HTSLIB
s->sam = str.s;
#endif
if (XA) {
for (k = 0; k < a->n; ++k) free(XA[k]);
free(XA);
Expand Down Expand Up @@ -1120,6 +1135,7 @@ typedef struct {
bseq1_t *seqs;
mem_alnreg_v *regs;
int64_t n_processed;
bam_hdr_t *h;
} worker_t;

static void worker1(void *data, int i, int tid)
Expand All @@ -1138,26 +1154,33 @@ static void worker1(void *data, int i, int tid)

static void worker2(void *data, int i, int tid)
{
extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]);
extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], bam_hdr_t *h);
extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a);
worker_t *w = (worker_t*)data;
if (!(w->opt->flag&MEM_F_PE)) {
#ifdef USE_HTSLIB
w->seqs[i].bams = bams_init();
#endif
if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name);
if (w->opt->flag & MEM_F_ALN_REG) {
mem_reg2ovlp(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]);
} else {
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0, w->h);
}
free(w->regs[i].a);
} else {
if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i<<1|0].name);
mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1]);
#ifdef USE_HTSLIB
w->seqs[i<<1].bams = bams_init();
w->seqs[1+(i<<1)].bams = bams_init();
#endif
mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1], w->h);
free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a);
}
}

void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0)
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0, bam_hdr_t *h)
{
extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n);
worker_t w;
Expand All @@ -1169,12 +1192,13 @@ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
ctime = cputime(); rtime = realtime();
global_bns = bns;
regs = malloc(n * sizeof(mem_alnreg_v));
w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac;
w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac;
w.seqs = seqs; w.regs = regs; w.n_processed = n_processed;
w.pes = &pes[0];
w.aux = malloc(opt->n_threads * sizeof(smem_aux_t));
for (i = 0; i < opt->n_threads; ++i)
w.aux[i] = smem_aux_init();
w.h = h;
kt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions
for (i = 0; i < opt->n_threads; ++i)
smem_aux_destroy(w.aux[i]);
Expand Down
6 changes: 5 additions & 1 deletion bwamem.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ typedef struct {
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
int max_hits; // if there are max_hits or fewer, output them all
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
#ifdef USE_HTSLIB
int bam_output;
#endif
} mem_opt_t;

typedef struct {
Expand Down Expand Up @@ -123,8 +126,9 @@ extern "C" {
* @param seqs query sequences; $seqs[i].seq/sam to be modified after the call
* @param pes0 insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements,
* corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info.
* @param h the BAM header, NULL if not using HTSLIB
*/
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0);
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0, bam_hdr_t *h);

/**
* Find the aligned regions for one query sequence
Expand Down
12 changes: 12 additions & 0 deletions bwamem_extra.c
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,11 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *
return ar;
}

#ifdef USE_HTSLIB
void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, bam_hdr_t *h)
#else
void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a)
#endif
{
int i;
kstring_t str = {0,0,0};
Expand All @@ -102,7 +106,15 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
ksprintf(&str, "%.3f", (double)p->truesc / opt->a / (qe - qb > re - rb? qe - qb : re - rb));
kputc('\n', &str);
}
#ifdef USE_HTSLIB
bam1_t *b = bam_init1();
if (sam_parse1(&str, h, b) < 0) {
// TODO: error!
}
bams_add(s->bams, b);
#else
s->sam = str.s;
#endif
}

// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
Expand Down
Loading

0 comments on commit 482a47a

Please sign in to comment.