Skip to content

Commit

Permalink
moved indexing to libbwa
Browse files Browse the repository at this point in the history
  • Loading branch information
lh3 committed Feb 13, 2016
1 parent 86170ff commit 9a82834
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 27 deletions.
9 changes: 5 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ CFLAGS= -g -Wall -Wno-unused-function -O2
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
AOBJS= QSufSort.o bwt_gen.o rope.o rle.o bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
is.o bwtindex.o bwape.o kopen.o pemerge.o maxk.o \
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \
QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o
AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
bwape.o kopen.o pemerge.o maxk.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
PROG= bwa
Expand Down Expand Up @@ -63,7 +64,7 @@ bwt_gen.o: QSufSort.h malloc_wrap.h
bwt_lite.o: bwt_lite.h malloc_wrap.h
bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h
bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h
bwtindex.o: bntseq.h bwt.h utils.h rle.h rope.h malloc_wrap.h
bwtindex.o: bntseq.h bwa.h bwt.h utils.h rle.h rope.h malloc_wrap.h
bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h
bwtsw2_aux.o: malloc_wrap.h bwa.h ksw.h kseq.h ksort.h
bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h malloc_wrap.h ksort.h
Expand Down
7 changes: 7 additions & 0 deletions bwa.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@

#define BWA_CTL_SIZE 0x10000

#define BWTALGO_AUTO 0
#define BWTALGO_RB2 1
#define BWTALGO_BWTSW 2
#define BWTALGO_IS 3

typedef struct {
bwt_t *bwt; // FM-index
bntseq_t *bns; // information on the reference sequences
Expand Down Expand Up @@ -41,6 +46,8 @@ extern "C" {
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);

int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size);

char *bwa_idx_infer_prefix(const char *hint);
bwt_t *bwa_idx_load_bwt(const char *hint);

Expand Down
55 changes: 32 additions & 23 deletions bwtindex.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include <time.h>
#include <zlib.h>
#include "bntseq.h"
#include "bwa.h"
#include "bwt.h"
#include "utils.h"
#include "rle.h"
Expand Down Expand Up @@ -208,19 +209,14 @@ int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command

int bwa_index(int argc, char *argv[]) // the "index" command
{
extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);

char *prefix = 0, *str, *str2, *str3;
int c, algo_type = 0, is_64 = 0, block_size = 10000000;
clock_t t;
int64_t l_pac;

int c, algo_type = BWTALGO_AUTO, is_64 = 0, block_size = 10000000;
char *prefix = 0, *str;
while ((c = getopt(argc, argv, "6a:p:b:")) >= 0) {
switch (c) {
case 'a': // if -a is not set, algo_type will be determined later
if (strcmp(optarg, "rb2") == 0) algo_type = 1;
else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2;
else if (strcmp(optarg, "is") == 0) algo_type = 3;
if (strcmp(optarg, "rb2") == 0) algo_type = BWTALGO_RB2;
else if (strcmp(optarg, "bwtsw") == 0) algo_type = BWTALGO_BWTSW;
else if (strcmp(optarg, "is") == 0) algo_type = BWTALGO_IS;
else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
break;
case 'p': prefix = strdup(optarg); break;
Expand Down Expand Up @@ -252,64 +248,77 @@ int bwa_index(int argc, char *argv[]) // the "index" command
strcpy(prefix, argv[optind]);
if (is_64) strcat(prefix, ".64");
}
bwa_idx_build(argv[optind], prefix, algo_type, block_size);
free(prefix);
return 0;
}

int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size)
{
extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);

char *str, *str2, *str3;
clock_t t;
int64_t l_pac;

str = (char*)calloc(strlen(prefix) + 10, 1);
str2 = (char*)calloc(strlen(prefix) + 10, 1);
str3 = (char*)calloc(strlen(prefix) + 10, 1);

{ // nucleotide indexing
gzFile fp = xzopen(argv[optind], "r");
gzFile fp = xzopen(fa, "r");
t = clock();
fprintf(stderr, "[bwa_index] Pack FASTA... ");
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack FASTA... ");
l_pac = bns_fasta2bntseq(fp, prefix, 0);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
err_gzclose(fp);
}
if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT
{
strcpy(str, prefix); strcat(str, ".pac");
strcpy(str2, prefix); strcat(str2, ".bwt");
t = clock();
fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
if (algo_type == 2) bwt_bwtgen2(str, str2, block_size);
else if (algo_type == 1 || algo_type == 3) {
bwt_t *bwt;
bwt = bwt_pac2bwt(str, algo_type == 3);
bwt_dump_bwt(str2, bwt);
bwt_destroy(bwt);
}
fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
}
{
bwt_t *bwt;
strcpy(str, prefix); strcat(str, ".bwt");
t = clock();
fprintf(stderr, "[bwa_index] Update BWT... ");
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Update BWT... ");
bwt = bwt_restore_bwt(str);
bwt_bwtupdate_core(bwt);
bwt_dump_bwt(str, bwt);
bwt_destroy(bwt);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
}
{
gzFile fp = xzopen(argv[optind], "r");
gzFile fp = xzopen(fa, "r");
t = clock();
fprintf(stderr, "[bwa_index] Pack forward-only FASTA... ");
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack forward-only FASTA... ");
l_pac = bns_fasta2bntseq(fp, prefix, 1);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
err_gzclose(fp);
}
{
bwt_t *bwt;
strcpy(str, prefix); strcat(str, ".bwt");
strcpy(str3, prefix); strcat(str3, ".sa");
t = clock();
fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... ");
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... ");
bwt = bwt_restore_bwt(str);
bwt_cal_sa(bwt, 32);
bwt_dump_sa(str3, bwt);
bwt_destroy(bwt);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
}
free(str3); free(str2); free(str); free(prefix);
free(str3); free(str2); free(str);
return 0;
}

0 comments on commit 9a82834

Please sign in to comment.