Skip to content

Commit

Permalink
Update
Browse files Browse the repository at this point in the history
  • Loading branch information
fbreitwieser committed Oct 23, 2017
1 parent cdc9fb6 commit 7d1ab24
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 33 deletions.
6 changes: 3 additions & 3 deletions scripts/krakenu-build_db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then
set -x
set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -o database.kdb -i database.idx -v \
-b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c database.kdb.counts \
-F <( cat_library ) > seqid2taxid-plus.map
-F <( cat_library ) -T > seqid2taxid-plus.map
set +x
if [ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ] || [ "$KRAKEN_ADD_TAXIDS_FOR_GENOME" == "1" ]; then
mv seqid2taxid.map seqid2taxid.map.orig
Expand Down Expand Up @@ -300,8 +300,8 @@ if [ "$KRAKEN_UID_DATABASE" != "0" ]; then
## Make a classification report
REPNAME=uid_database
if [[ ! -s $REPNAME.report.tsv ]]; then
#echo "Creating UID database summary report $REPNAME.report.tsv ..."
#krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv
echo "Creating UID database summary report $REPNAME.report.tsv ..."
krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv
fi
fi

Expand Down
6 changes: 3 additions & 3 deletions src/Makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
CXX = g++
FOPENMP?=-fopenmp
CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O2 -g -Wfatal-errors
CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -g -Wfatal-errors
#CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors
PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping
PROGS = classify db_sort set_lcas make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping
LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream

.PHONY: all install clean
Expand All @@ -28,7 +28,7 @@ read_uid_mapping: quickfile.o
classify: classify.cpp krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o hyperloglogplus.h taxdb.h report-cols.h
$(CXX) $(CXXFLAGS) -o classify $^ $(LIBFLAGS)

build_taxdb: taxdb.h report-cols.h
build_taxdb: taxdb.h report-cols.h quickfile.o

make_seqid_to_taxid_map: quickfile.o

Expand Down
30 changes: 25 additions & 5 deletions src/krakenutil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "kraken_headers.hpp"
#include "krakenutil.hpp"
#include <unordered_set>
#include<algorithm>

using namespace std;

Expand Down Expand Up @@ -50,7 +51,26 @@ namespace kraken {
// Return lowest common ancestor of a and b
// LCA(0,x) = LCA(x,0) = x
// Default ancestor is 1 (root of tree)
uint32_t lca(const unordered_map<uint32_t, uint32_t> &parent_map, uint32_t a, uint32_t b) {
uint32_t lca(const unordered_map<uint32_t, uint32_t> &parent_map,
uint32_t a, uint32_t b)
{
if (a == 0 || b == 0)
return a ? a : b;

unordered_set<uint32_t> a_path;
while (a > 0) {
a_path.insert(a);
a = parent_map.at(a);
}
while (b > 0) {
if (a_path.count(b) > 0)
return b;
b = parent_map.at(b);
}
return 1;
}

uint32_t lca_vec(const unordered_map<uint32_t, uint32_t> &parent_map, uint32_t a, uint32_t b) {
if (a == 0 || b == 0)
return a ? a : b;

Expand All @@ -59,19 +79,19 @@ namespace kraken {
do {
if (a == b)
return a;
a_path.insert(a);
a_path.push_back(a);
a = parent_map.at(a);
} while (a != a_path.back())
} while (a != a_path.back());

// search for b in the path from a to the root
uint32_t last_b = 0;
do {
if (a_path.find(b) != a_path.end())
if (std::find(a_path.begin(), a_path.end(), b) != a_path.end())
return b;

last_b = b;
b = parent_map.at(b);
} while (last_b != b)
} while (last_b != b);
return 1;
}

Expand Down
39 changes: 24 additions & 15 deletions src/set_lcas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,16 @@ void usage(int exit_code=EX_USAGE);
void process_files();
void process_single_file();
void process_file(string filename, uint32_t taxid);
void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish);
void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish, bool is_contaminant_taxid = false);

int Num_threads = 1;
string DB_filename, Index_filename,
Output_DB_filename, TaxDB_filename,
Kmer_count_filename,
File_to_taxon_map_filename,
ID_to_taxon_map_filename, Multi_fasta_filename;
bool force_taxid = false;
int New_taxid_start = 1000000000;
bool force_contaminant_taxid = false;
uint32_t New_taxid_start = 1000000000;

bool Allow_extra_kmers = false;
bool verbose = false;
Expand Down Expand Up @@ -88,7 +88,7 @@ int main(int argc, char **argv) {

parse_command_line(argc, argv);

if (!TaxDB_filename.empty() && !force_taxid) {
if (!TaxDB_filename.empty()) {
taxdb = TaxonomyDB<uint32_t, ReadCounts>(TaxDB_filename);
for (const auto & tax : taxdb.taxIDsAndEntries) {
if (tax.first != 0)
Expand Down Expand Up @@ -211,6 +211,15 @@ unordered_map<string,uint32_t> read_seqid_to_taxid_map(string ID_to_taxon_map_fi
string line, seq_id, name;
uint32_t taxid;

if (Add_taxIds_for_Assembly && Add_taxIds_for_Sequences) {
for (const auto& k : taxdb.taxIDsAndEntries) {
if (k.first >= New_taxid_start) {
New_taxid_start = k.first;
}
}
cerr << "Starting new taxonomy IDs with " << (New_taxid_start+1) << endl;
}

// Used when adding new taxids for assembly or sequence
unordered_map<string, uint32_t> name_to_taxid_map;

Expand Down Expand Up @@ -298,6 +307,8 @@ void process_single_file() {
continue;
}

bool is_contaminant_taxid = taxid == 32630 || taxid == 81077;

if (Add_taxIds_for_Sequences && taxid != 9606) {
// Update entry based on header line
auto entryIt = taxdb.taxIDsAndEntries.find(taxid);
Expand All @@ -319,7 +330,7 @@ void process_single_file() {
} else {
#pragma omp parallel for schedule(dynamic)
for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN)
set_lcas(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1);
set_lcas(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1, is_contaminant_taxid);
++seqs_processed;
}
} else {
Expand Down Expand Up @@ -378,7 +389,7 @@ void process_sequence(DNASequence dna) {
// Or maybe asembly_summary file?
}

void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) {
void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish, bool is_contaminant_taxid) {
KmerScanner scanner(seq, start, finish);
uint64_t *kmer_ptr;
uint32_t *val_ptr;
Expand All @@ -403,14 +414,12 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) {
if (Use_uids_instead_of_taxids) {
#pragma omp critical(new_uid)
*val_ptr = uid_mapping(Taxids_to_UID_map, UID_to_taxids_vec, taxid, *val_ptr, current_uid, UID_map_file);
} else if (!force_taxid && taxid != contaminant_taxids) {
if (Parent_map.find(taxid) != Parent_map.end()) {
*val_ptr = lca(Parent_map, taxid, *val_ptr);
}
} else {
// When force_taxid is set, do not compute lca, but assign the taxid
} else if (force_contaminant_taxid && is_contaminant_taxid) {
// When force_contaminant_taxid is set, do not compute lca, but assign the taxid
// of the (last) sequence to k-mers
*val_ptr = taxid;
} else {
*val_ptr = lca(Parent_map, taxid, *val_ptr);
}
}
}
Expand Down Expand Up @@ -454,7 +463,7 @@ void parse_command_line(int argc, char **argv) {
#endif
break;
case 'T' :
force_taxid = true;
force_contaminant_taxid = true;
break;
case 'v' :
verbose = true;
Expand Down Expand Up @@ -516,8 +525,8 @@ void usage(int exit_code) {
<< " -f filename File to taxon map" << endl
<< " -F filename Multi-FASTA file with sequence data" << endl
<< " -m filename Sequence ID to taxon map" << endl
<< " -a Add taxonomy IDs (starting with "<<New_taxid_start<<") for assemblies (third column in seqid2taxid.map) to Taxonomy DB" << endl
<< " -A Add taxonomy IDs (starting with "<<New_taxid_start<<") for sequences to Taxonomy DB" << endl
<< " -a Add taxonomy IDs (starting with "<<(New_taxid_start+1)<<") for assemblies (third column in seqid2taxid.map) to Taxonomy DB" << endl
<< " -A Add taxonomy IDs (starting with "<<(New_taxid_start+1)<<") for sequences to Taxonomy DB" << endl
<< " -T Do not set LCA as taxid for kmers, but the taxid of the sequence" << endl
<< " -I filename Write UIDs into database, and output (binary) UID-to-taxid map to filename" << endl
<< " -p Pretend - do not write database back to disk (when working in RAM)" << endl
Expand Down
16 changes: 11 additions & 5 deletions tests/build-dbs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@ set -eu


build_db() {
K=$1; shift
NAM=$1; shift
local K=$1; shift
local NAM=$1; shift
local MIN=15

DB_NAM=refseq-$NAM-k$K
local DB_NAM=refseq-$NAM-k$K
DB_DIR=$DIR/dbs/$DB_NAM

mkdir -p $DB_DIR
CMD="krakenu-build --kmer-len $K --minimizer-len 12 --threads $THREADS --db $DB_DIR --build --taxids-for-genomes --taxids-for-sequences --taxonomy-dir=$DIR/data/taxonomy --uid-database"
CMD="krakenu-build --kmer-len $K --minimizer-len $MIN --threads $THREADS --db $DB_DIR --build --taxids-for-genomes --taxids-for-sequences --taxonomy-dir=$DIR/data/taxonomy --uid-database"
for L in $@; do
CMD="$CMD --library-dir=$DIR/data/library/$L"
done
Expand All @@ -34,13 +35,18 @@ build_db() {
}

#export PATH="$DIR/install:$PATH"
for K in 31 21; do
for K in 31; do
if [[ `uname` == "Darwin" ]]; then
build_db $K viral viral
build_db $K all-viral viral viral-neighbors
else
build_db $K oct2017 archaea-dusted bacteria-dusted viral-dusted viral-neighbors-dusted \
vertebrate_mammalian contaminants

EUKD=$DIR/dbs/refseq-euk-oct2017-k31
[[ -d $EUKD ]] || mkdir -p $EUKD
[[ -f $EUKD/taxDB ]] || cp -v $DB_DIR/taxDB $EUKD
build_db $K euk-oct2017 fungi protozoa
#build_db $K bacteria bacteria archaea
fi
done
Expand Down
5 changes: 3 additions & 2 deletions tests/init.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ set -xeu

## Download taxonomy and genomic data into data/
#$DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors taxonomy refseq/archaea refseq/bacteria refseq/viral/Any
$DIR/install/krakenu-download --db $DIR/data -R refseq/fungi refseq/fungi/Chromosome refseq/protozoa refseq/protozoa/Chromosome
#$DIR/install/krakenu-download --db $DIR/data --fna rna,genomic -R refseq/vertebrate_mammalian/Chromosome/taxid9606
$DIR/install/krakenu-download --db $DIR/data -R contaminants
#$DIR/install/krakenu-download --db $DIR/data -R contaminants

for i in viral viral-neighbors archaea bacteria; do
for i in fungi protozoa viral viral-neighbors archaea bacteria; do
[[ -s "$DIR/data/all-$i.fna" ]] || find $DIR/data/library/$i -name '*.fna' -exec cat {} \; > $DIR/data/all-$i.fna
[[ -s "$DIR/data/all-$i.map" ]] || find $DIR/data/library/$i -name '*.map' -exec cat {} \; > $DIR/data/all-$i.map
DUSTED_F="$DIR/data/all-$i-dusted.fna"
Expand Down

0 comments on commit 7d1ab24

Please sign in to comment.