Skip to content

Commit

Permalink
r898: read the index into a single memory block
Browse files Browse the repository at this point in the history
Prepare for shared memory. Not used now.
  • Loading branch information
lh3 committed Oct 15, 2014
1 parent c443579 commit c5e859b
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 13 deletions.
15 changes: 6 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,36 +71,33 @@ algorithm and setting may vary. The following list gives the recommended
settings:

* Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly
contigs up to a few megabases mapped to a close related reference genome:
contigs up to a few megabases mapped to a closely related reference genome:

bwa mem ref.fa reads.fq > aln.sam

* Illumina single-end reads no longer than ~70bp:
* Illumina single-end reads shorter than ~70bp:

bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam

* Illumina/454/IonTorrent paired-end reads longer than ~70bp:

bwa mem ref.fa read1.fq read2.fq > aln-pe.sam

* Illumina paired-end reads no longer than ~70bp:
* Illumina paired-end reads shorter than ~70bp:

bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai
bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam

* PacBio subreads to a reference genome:
* PacBio subreads or Oxford Nanopore reads to a reference genome:

bwa mem -x pacbio ref.fa reads.fq > aln.sam

* Oxford Nanopore reads to a reference genome:

bwa mem -x ont2d ref.fa reads.fq > aln.sam

BWA-MEM is recommended for query sequences longer than ~70bp for a variety of
error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with
errors given longer query sequences as the chance of missing all seeds is small.
As is shown above, with non-default settings, BWA-MEM works with PacBio subreads
with a sequencing error rate as high as ~15%.
As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore
reads with a sequencing error rate over 20%.

####<a name="multihit"></a>2. Why does a read appear multiple times in the output SAM?

Expand Down
74 changes: 71 additions & 3 deletions bwa.c
Original file line number Diff line number Diff line change
Expand Up @@ -259,12 +259,80 @@ bwaidx_t *bwa_idx_load(const char *hint, int which)
void bwa_idx_destroy(bwaidx_t *idx)
{
if (idx == 0) return;
if (idx->bwt) bwt_destroy(idx->bwt);
if (idx->bns) bns_destroy(idx->bns);
if (idx->pac) free(idx->pac);
if (idx->mem == 0) {
if (idx->bwt) bwt_destroy(idx->bwt);
if (idx->bns) bns_destroy(idx->bns);
if (idx->pac) free(idx->pac);
} else free((uint8_t*)idx->mem);
free(idx);
}

int bwa_mem2idx(int64_t l_mem, const uint8_t *mem, bwaidx_t *idx)
{
int64_t k = 0, x;
int i;

// generate idx->bwt
x = sizeof(bwt_t); idx->bwt = (bwt_t*)(mem + k); k += x;
x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x;
x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x;

// generate idx->bns and idx->pac
x = sizeof(bntseq_t); idx->bns = (bntseq_t*)(mem + k); k += x;
x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x;
x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = (bntann1_t*)(mem + k); k += x;
for (i = 0; i < idx->bns->n_seqs; ++i) {
idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1;
idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1;
}
idx->pac = (uint8_t*)(mem + k);

idx->l_mem = k; idx->mem = mem;
return 0;
}

int bwa_idx2mem(bwaidx_t *idx)
{
int i;
int64_t k = 0, x, tmp;
uint8_t *mem = 0;

// copy idx->bwt
mem = malloc(sizeof(bwt_t) + idx->bwt->n_sa * sizeof(bwtint_t));
x = sizeof(bwt_t); memcpy(mem + k, idx->bwt, x); k += x;
x = idx->bwt->n_sa * sizeof(bwtint_t); memcpy(mem + k, idx->bwt->sa, x); k += x;
free(idx->bwt->sa);
mem = realloc(mem, k + idx->bwt->bwt_size * 4);
x = idx->bwt->bwt_size * 4; memcpy(mem + k, idx->bwt->bwt, x); k += x;
free(idx->bwt->bwt);
free(idx->bwt); idx->bwt = 0;

// copy idx->bns
tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t);
for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory
tmp += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2;
mem = realloc(mem, k + sizeof(bntseq_t) + tmp);
x = sizeof(bntseq_t); memcpy(mem + k, idx->bns, x); k += x;
x = idx->bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, idx->bns->ambs, x); k += x;
free(idx->bns->ambs);
x = idx->bns->n_seqs * sizeof(bntann1_t); memcpy(mem + k, idx->bns->anns, x); k += x;
for (i = 0; i < idx->bns->n_seqs; ++i) {
x = strlen(idx->bns->anns[i].name) + 1; memcpy(mem + k, idx->bns->anns[i].name, x); k += x;
x = strlen(idx->bns->anns[i].anno) + 1; memcpy(mem + k, idx->bns->anns[i].anno, x); k += x;
free(idx->bns->anns[i].name); free(idx->bns->anns[i].anno);
}
free(idx->bns->anns);

// copy idx->pac
x = idx->bns->l_pac/4+1;
mem = realloc(mem, k + x);
memcpy(mem + k, idx->pac, x); k += x;
free(idx->bns); idx->bns = 0;
free(idx->pac); idx->pac = 0;

return bwa_mem2idx(k, mem, idx);
}

/***********************
* SAM header routines *
***********************/
Expand Down
5 changes: 5 additions & 0 deletions bwa.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ typedef struct {
bwt_t *bwt; // FM-index
bntseq_t *bns; // information on the reference sequences
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base

int64_t l_mem;
const uint8_t *mem;
} bwaidx_t;

typedef struct {
Expand All @@ -39,6 +42,8 @@ extern "C" {

bwaidx_t *bwa_idx_load(const char *hint, int which);
void bwa_idx_destroy(bwaidx_t *idx);
int bwa_idx2mem(bwaidx_t *idx);
int bwa_mem2idx(int64_t l_mem, const uint8_t *mem, bwaidx_t *idx);

void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line);
char *bwa_set_rg(const char *s);
Expand Down
2 changes: 1 addition & 1 deletion main.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include "utils.h"

#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.10-r896-dirty"
#define PACKAGE_VERSION "0.7.10-r898-dirty"
#endif

int bwa_fa2pac(int argc, char *argv[]);
Expand Down

0 comments on commit c5e859b

Please sign in to comment.