Skip to content

Commit

Permalink
support read group in bwa-mem
Browse files Browse the repository at this point in the history
  • Loading branch information
lh3 committed Feb 23, 2013
1 parent cfa7165 commit ee4540c
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 76 deletions.
66 changes: 63 additions & 3 deletions bwa.c
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
#include <string.h>
#include <stdio.h>
#include <zlib.h>
#include "bntseq.h"
#include "bwa.h"
#include "ksw.h"
#include "utils.h"

int bwa_verbose = 3;
char bwa_rg_id[256];

/************************
* Batch FASTA/Q reader *
Expand Down Expand Up @@ -132,8 +135,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
bwt_t *bwt;
prefix = bwa_idx_infer_prefix(hint);
if (prefix == 0) {
if (bwa_verbose >= 1)
fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
return 0;
}
tmp = calloc(strlen(prefix) + 5, 1);
Expand All @@ -150,7 +152,10 @@ bwaidx_t *bwa_idx_load(const char *hint, int which)
bwaidx_t *idx;
char *prefix;
prefix = bwa_idx_infer_prefix(hint);
if (prefix == 0) return 0;
if (prefix == 0) {
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
return 0;
}
idx = calloc(1, sizeof(bwaidx_t));
if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
if (which & BWA_IDX_BNS) {
Expand All @@ -174,3 +179,58 @@ void bwa_idx_destroy(bwaidx_t *idx)
if (idx->pac) free(idx->pac);
free(idx);
}

/***********************
* SAM header routines *
***********************/

void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line)
{
int i;
for (i = 0; i < bns->n_seqs; ++i)
err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
if (rg_line) err_printf("%s\n", rg_line);
}

static char *bwa_escape(char *s)
{
char *p, *q;
for (p = q = s; *p; ++p) {
if (*p == '\\') {
++p;
if (*p == 't') *q++ = '\t';
else if (*p == 'n') *q++ = '\n';
else if (*p == 'r') *q++ = '\r';
else if (*p == '\\') *q++ = '\\';
} else *q++ = *p;
}
*q = '\0';
return s;
}

char *bwa_set_rg(const char *s)
{
char *p, *q, *r, *rg_line = 0;
memset(bwa_rg_id, 0, 256);
if (strstr(s, "@RG") != s) return 0;
rg_line = strdup(s);
bwa_escape(rg_line);
if ((p = strstr(rg_line, "\tID:")) == 0) {
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID in the @RG line\n", __func__);
goto err_set_rg;
}
p += 4;
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
if (q - p + 1 > 256) {
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] RG:ID is longer than 255 characters\n", __func__);
goto err_set_rg;
}
for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
*r++ = *q;
return rg_line;

err_set_rg:
free(rg_line);
return 0;
}

4 changes: 4 additions & 0 deletions bwa.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ typedef struct {
} bseq1_t;

extern int bwa_verbose;
extern char bwa_rg_id[256];

#ifdef __cplusplus
extern "C" {
Expand All @@ -36,6 +37,9 @@ extern "C" {
bwaidx_t *bwa_idx_load(const char *hint, int which);
void bwa_idx_destroy(bwaidx_t *idx);

void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line);
char *bwa_set_rg(const char *s);

#ifdef __cplusplus
}
#endif
Expand Down
1 change: 1 addition & 0 deletions bwamem.c
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons
}
if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
if (bwa_rg_id) { kputsn("\tRG:i:", 6, str); kputs(bwa_rg_id, str); }
kputc('\n', str);
free(cigar);
#undef is_mapped
Expand Down
19 changes: 6 additions & 13 deletions bwape.c
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,7 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
return pacseq;
}

void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt)
void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line)
{
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
int i, j, n_seqs, tot_seqs = 0;
Expand Down Expand Up @@ -654,7 +654,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
}

// core loop
bwa_print_sam_SQ(bns);
bwa_print_sam_hdr(bns, rg_line);
bwa_print_sam_PG();
while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) {
int cnt_chg;
Expand Down Expand Up @@ -715,20 +715,15 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f

int bwa_sai2sam_pe(int argc, char *argv[])
{
extern char *bwa_rg_line, *bwa_rg_id;
extern int bwa_set_rg(const char *s);
int c;
pe_opt_t *popt;
char *prefix;
char *prefix, *rg_line = 0;

popt = bwa_init_pe_opt();
while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) {
switch (c) {
case 'r':
if (bwa_set_rg(optarg) < 0) {
fprintf(stderr, "[%s] malformated @RG line\n", __func__);
return 1;
}
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
break;
case 'a': popt->max_isize = atoi(optarg); break;
case 'o': popt->max_occ = atoi(optarg); break;
Expand Down Expand Up @@ -764,11 +759,9 @@ int bwa_sai2sam_pe(int argc, char *argv[])
}
if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
free(bwa_rg_line); free(bwa_rg_id);
return 0;
}
bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt);
free(bwa_rg_line); free(bwa_rg_id); free(prefix);
free(popt);
bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line);
free(prefix); free(popt);
return 0;
}
59 changes: 5 additions & 54 deletions bwase.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
#include "bwa.h"

int g_log_n[256];
char *bwa_rg_line, *bwa_rg_id;

void bwa_print_sam_PG();

Expand Down Expand Up @@ -490,56 +489,13 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
}
}

void bwa_print_sam_SQ(const bntseq_t *bns)
{
int i;
for (i = 0; i < bns->n_seqs; ++i)
err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
if (bwa_rg_line) err_printf("%s\n", bwa_rg_line);
}

void bwase_initialize()
{
int i;
for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
}

char *bwa_escape(char *s)
{
char *p, *q;
for (p = q = s; *p; ++p) {
if (*p == '\\') {
++p;
if (*p == 't') *q++ = '\t';
else if (*p == 'n') *q++ = '\n';
else if (*p == 'r') *q++ = '\r';
else if (*p == '\\') *q++ = '\\';
} else *q++ = *p;
}
*q = '\0';
return s;
}

int bwa_set_rg(const char *s)
{
char *p, *q, *r;
if (strstr(s, "@RG") != s) return -1;
if (bwa_rg_line) free(bwa_rg_line);
if (bwa_rg_id) free(bwa_rg_id);
bwa_rg_line = strdup(s);
bwa_rg_id = 0;
bwa_escape(bwa_rg_line);
p = strstr(bwa_rg_line, "\tID:");
if (p == 0) return -1;
p += 4;
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
bwa_rg_id = calloc(q - p + 1, 1);
for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
*r++ = *q;
return 0;
}

void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ)
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line)
{
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
int i, n_seqs, tot_seqs = 0, m_aln;
Expand All @@ -559,7 +515,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f

m_aln = 0;
fread(&opt, sizeof(gap_opt_t), 1, fp_sa);
bwa_print_sam_SQ(bns);
bwa_print_sam_hdr(bns, rg_line);
//bwa_print_sam_PG();
// set ks
ks = bwa_open_reads(opt.mode, fn_fa);
Expand Down Expand Up @@ -608,15 +564,12 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
int bwa_sai2sam_se(int argc, char *argv[])
{
int c, n_occ = 3;
char *prefix;
char *prefix, *rg_line = 0;
while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) {
switch (c) {
case 'h': break;
case 'r':
if (bwa_set_rg(optarg) < 0) {
fprintf(stderr, "[%s] malformated @RG line\n", __func__);
return 1;
}
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
break;
case 'n': n_occ = atoi(optarg); break;
case 'f': xreopen(optarg, "w", stdout); break;
Expand All @@ -630,10 +583,8 @@ int bwa_sai2sam_se(int argc, char *argv[])
}
if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
free(bwa_rg_line); free(bwa_rg_id);
return 0;
}
bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ);
free(bwa_rg_line); free(bwa_rg_id);
bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line);
return 0;
}
15 changes: 9 additions & 6 deletions fastmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,25 @@ extern unsigned char nst_nt4_table[256];
int main_mem(int argc, char *argv[])
{
mem_opt_t *opt;
int c, n, l;
int c, n;
gzFile fp, fp2 = 0;
kseq_t *ks, *ks2 = 0;
bseq1_t *seqs;
bwaidx_t *idx;
char *rg_line = 0;

opt = mem_opt_init();
while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:")) >= 0) {
while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:R:")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg);
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
else if (c == 'H') opt->flag |= MEM_F_HARDCLIP;
else if (c == 'c') opt->max_occ = atoi(optarg);
else if (c == 'v') mem_verbose = atoi(optarg);
else if (c == 'r') opt->split_factor = atof(optarg);
else if (c == 's') opt->split_width = atoi(optarg);
else if (c == 'R') {
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak
} else if (c == 's') opt->split_width = atoi(optarg);
}
if (optind + 1 >= argc) {
fprintf(stderr, "\n");
Expand All @@ -39,15 +42,15 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
fprintf(stderr, " -R STR read group header line such as '@RG\tID:foo\tSM:bar' [null]\n");
fprintf(stderr, " -v INT verbose level [%d]\n", mem_verbose);
fprintf(stderr, "\n");
free(opt);
return 1;
}
mem_fill_scmat(opt->a, opt->b, opt->mat);
idx = bwa_idx_load(argv[optind], BWA_IDX_ALL);
for (l = 0; l < idx->bns->n_seqs; ++l)
printf("@SQ\tSN:%s\tLN:%d\n", idx->bns->anns[l].name, idx->bns->anns[l].len);
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
bwa_print_sam_hdr(idx->bns, rg_line);

fp = strcmp(argv[optind + 1], "-")? gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r");
ks = kseq_init(fp);
Expand Down

0 comments on commit ee4540c

Please sign in to comment.