Skip to content

Commit

Permalink
Various improvements and fixes for building and classification
Browse files Browse the repository at this point in the history
  • Loading branch information
fbreitwieser committed Oct 2, 2017
1 parent 436938d commit 6e909c1
Show file tree
Hide file tree
Showing 9 changed files with 214 additions and 104 deletions.
7 changes: 5 additions & 2 deletions scripts/krakenu
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,9 @@ if ($@) {
die "$PROG: $@";
}

my @kdb_files = map { "$_/database.kdb" } @db_prefix;
my $database = $uid_mapping? "uid_database.kdb" : "database.kdb";
my @kdb_files = map { "$_/$database" } @db_prefix;

my @idx_files = map { "$_/database.idx" } @db_prefix;

foreach my $file (@kdb_files,@idx_files) {
Expand Down Expand Up @@ -148,7 +150,7 @@ push @flags, "-r", $report_file if defined $report_file;
push @flags, "-a", $db_prefix[0]."/taxDB";
push @flags, "-s" if $print_sequence;
if ($uid_mapping) {
my $uid_mapping_file = "$db_prefix[0]/uid_to_taxid";
my $uid_mapping_file = "$db_prefix[0]/uid_to_taxid.map";
if (!-f $uid_mapping_file) {
print STDERR "Missing required file $uid_mapping_file for UID mapping.\n";
exit(1);
Expand Down Expand Up @@ -220,6 +222,7 @@ Usage: $PROG [options] <filename(s)>
Options:
--db NAME Name for Kraken DB (default: $default_db)
--report-file FILENAME Write Kraken report to FILENAME
--uid-mapping Map using UID database
--threads NUM Number of threads (default: $def_thread_ct)
--fasta-input Input is FASTA format
--fastq-input Input is FASTQ format
Expand Down
8 changes: 4 additions & 4 deletions scripts/krakenu-build_db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then
start_time1=$(date "+%s.%N")
set -x
set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -o database.kdb -i database.idx -v \
-b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c database.kmer_count \
-b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c database.kdb.counts \
-F <( cat_library ) > seqid2taxid-plus.map
set +x
if [ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ] || [ "$KRAKEN_ADD_TAXIDS_FOR_GENOME" == "1" ]; then
Expand Down Expand Up @@ -292,16 +292,16 @@ if [ "$KRAKEN_UID_DATABASE" != "0" ]; then
fi
start_time1=$(date "+%s.%N")
set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -I uid_to_taxid.map -o uid_database.kdb -i database.idx -v \
-b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c uid_database.kmer_count -F <( cat_library )
-b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c uid_database.kdb.counts -F <( cat_library )

echo "UID Database created. [$(report_time_elapsed $start_time1)]"
fi

## Make a classification report
REPNAME=uid_database
if [[ ! -s $REPNAME.report.tsv ]]; then
echo "Creating UID database summary report $REPNAME.report.tsv ..."
krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --fasta-input <(cat_library) > $REPNAME.kraken.tsv
#echo "Creating UID database summary report $REPNAME.report.tsv ..."
#krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv
fi
fi

Expand Down
16 changes: 12 additions & 4 deletions src/Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
CXX = g++
FOPENMP?=-fopenmp
CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O2 -g -Wfatal-errors
PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb
#CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors
PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping
LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream

.PHONY: all install clean
Expand All @@ -18,17 +19,21 @@ db_shrink: krakendb.o quickfile.o

db_sort: krakendb.o quickfile.o

set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.cpp
set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o

grade_classification: taxdb.h

classify: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.cpp
$(CXX) $(CXXFLAGS) -o classify classify.cpp $^ $(LIBFLAGS)
read_uid_mapping: quickfile.o

classify: classify.cpp krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o hyperloglogplus.h
$(CXX) $(CXXFLAGS) -o classify $^ $(LIBFLAGS)

build_taxdb: taxdb.h

make_seqid_to_taxid_map: quickfile.o

read_uid_mapping: quickfile.o krakenutil.o uid_mapping.o

krakenutil.o: krakenutil.cpp krakenutil.hpp taxdb.h
$(CXX) $(CXXFLAGS) -c krakenutil.cpp

Expand All @@ -40,3 +45,6 @@ seqreader.o: seqreader.cpp seqreader.hpp quickfile.hpp

quickfile.o: quickfile.cpp quickfile.hpp
$(CXX) $(CXXFLAGS) -c quickfile.cpp

uid_mapping.o: krakenutil.hpp uid_mapping.hpp uid_mapping.cpp
$(CXX) $(CXXFLAGS) -c uid_mapping.cpp
117 changes: 79 additions & 38 deletions src/classify.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss,
ostringstream &coss, ostringstream &uoss,
unordered_map<uint32_t, ReadCounts>&);
inline void print_sequence(ostringstream* oss_ptr, const DNASequence& dna);
string hitlist_string(const vector<uint32_t> &taxa);
string hitlist_string(const vector<uint32_t> &taxa, const vector<char>& ambig_list);


set<uint32_t> get_ancestry(uint32_t taxon);
Expand Down Expand Up @@ -211,7 +211,7 @@ int main(int argc, char **argv) {
Report_output = cout_or_file(Report_output_file);
}

cerr << "Print_kraken: " << Print_kraken << "; Print_kraken_report: " << Print_kraken_report << "; k: " << uint32_t(KrakenDatabases[0]->get_k()) << endl;
//cerr << "Print_kraken: " << Print_kraken << "; Print_kraken_report: " << Print_kraken_report << "; k: " << uint32_t(KrakenDatabases[0]->get_k()) << endl;

struct timeval tv1, tv2;
gettimeofday(&tv1, NULL);
Expand All @@ -222,21 +222,26 @@ int main(int argc, char **argv) {
std::cerr << "Finishing up ..\n";

if (Print_kraken_report) {
for (auto fname : DB_filenames) {
ifstream ifs(fname + ".counts");
if (ifs.good()) {
ifs.close();
taxdb.readGenomeSizes(fname+".counts");
}
}

taxdb.setReadCounts(taxon_counts);
TaxReport<uint32_t,ReadCounts> rep = TaxReport<uint32_t, ReadCounts>(*Report_output, taxdb, false);
rep.setReportCols({
"percReadsClade",
"numReadsClade",
"numReadsTaxon",
"numUniqueKmersClade",
"numUniqueKmersTaxon",
"numKmersClade",
"numKmersTaxon",
"numKmersInDatabaseClade",
"numKmersInDatabaseTaxon",
"%",
"reads",
"taxReads",
"kmers",
"dup",
"cov",
"taxID",
"taxRank",
"indentedName"});
"rank",
"taxName"});
rep.printReport("kraken","blu");
}

Expand Down Expand Up @@ -336,22 +341,23 @@ void process_file(char *filename) {
total_sequences += work_unit.size();
total_bases += total_nt;
//if (Print_Progress && total_sequences % 100000 < work_unit.size())
if (Print_Progress && total_sequences % 100000 < work_unit.size())
if (Print_Progress) {
cerr << "\rProcessed " << total_sequences << " sequences (" << total_classified << " classified) ...";
}
}
}
} // end parallel section

delete reader;
}

// Look up the taxon ID stored in `database` for the k-mer at *kmer_ptr.
// The canonical (strand-independent) representation of the k-mer is
// queried.  `current_bin_key`, `current_min_pos` and `current_max_pos`
// are per-database search-cursor state that kmer_query updates and
// reuses across consecutive queries on nearby k-mers.
// Returns 0 when the k-mer is not present in the database.
inline
uint32_t get_taxon_for_kmer(KrakenDB& database, uint64_t* kmer_ptr, uint64_t& current_bin_key,
                            int64_t& current_min_pos, int64_t& current_max_pos) {
  uint32_t* val_ptr = database.kmer_query(
      database.canonical_representation(*kmer_ptr), &current_bin_key,
      &current_min_pos, &current_max_pos);
  // A null pointer means "k-mer not in database" -> unclassified (taxon 0).
  return val_ptr ? *val_ptr : 0;
}


Expand Down Expand Up @@ -385,7 +391,45 @@ void append_hitlist_string(string& hitlist_string, uint32_t& last_taxon, uint32_
}
}

// Render a read's per-k-mer taxon assignments as a Kraken-style hit-list
// string, e.g. "562:13 0:4 A:2".  Consecutive runs of the same taxon are
// collapsed into "taxon:count" pairs; k-mers flagged ambiguous
// (ambig[i] != 0) are reported as "A:count".
//
// Preconditions: taxa is non-empty and ambig.size() == taxa.size()
// (callers print "0:0" themselves when taxa is empty).
std::string hitlist_string(const std::vector<uint32_t> &taxa, const std::vector<uint8_t> &ambig)
{
  std::ostringstream hitlist;
  // Use a signed code so -1 can mark "ambiguous" without colliding with
  // any real (unsigned) taxon ID.
  int64_t last_code = ambig[0] ? -1 : static_cast<int64_t>(taxa[0]);
  int code_count = 1;

  for (size_t i = 1; i < taxa.size(); i++) {
    int64_t code = ambig[i] ? -1 : static_cast<int64_t>(taxa[i]);
    if (code == last_code) {
      code_count++;
    }
    else {
      // Run ended: flush it (ambiguous runs print as "A"), then start a
      // new run with the current code.
      if (last_code >= 0) {
        hitlist << last_code << ":" << code_count << " ";
      }
      else {
        hitlist << "A:" << code_count << " ";
      }
      code_count = 1;
      last_code = code;
    }
  }
  // Flush the final run, without a trailing space.
  if (last_code >= 0) {
    hitlist << last_code << ":" << code_count;
  }
  else {
    hitlist << "A:" << code_count;
  }
  return hitlist.str();
}


string hitlist_string_depr(const vector<uint32_t> &taxa)
{
uint32_t last_code = taxa[0];
int code_count = 1;
Expand Down Expand Up @@ -421,11 +465,8 @@ string hitlist_string(const vector<uint32_t> &taxa)
bool classify_sequence(DNASequence &dna, ostringstream &koss,
ostringstream &coss, ostringstream &uoss,
unordered_map<uint32_t, ReadCounts>& my_taxon_counts) {
size_t n_kmers = dna.seq.size()-KrakenDatabases[0]->get_k()+1;
vector<uint32_t> taxa;
taxa.reserve(n_kmers);
//vector<uint8_t> ambig_list;
//ambig_list.reserve(n_kmers);
vector<uint8_t> ambig_list;
unordered_map<uint32_t, uint32_t> hit_counts;
uint64_t *kmer_ptr;
uint32_t taxon = 0;
Expand All @@ -445,39 +486,40 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss,
vector<db_status> db_statuses(KrakenDatabases.size());

if (dna.seq.size() >= KrakenDatabases[0]->get_k()) {
size_t n_kmers = dna.seq.size()-KrakenDatabases[0]->get_k()+1;
taxa.reserve(n_kmers);
ambig_list.reserve(n_kmers);
KmerScanner scanner(dna.seq);
while ((kmer_ptr = scanner.next_kmer()) != NULL) {
taxon = 0;
if (scanner.ambig_kmer()) {
//append_hitlist_string(hitlist_string, last_taxon, last_counter, ambig_taxon);
//ambig_list.push_back(1);
taxa.push_back(-1);
ambig_list.push_back(1);
}
else {
//ambig_list.push_back(0);

ambig_list.push_back(0);
// go through multiple databases to map k-mer
for (size_t i=0; i<KrakenDatabases.size(); ++i) {
taxon = get_taxon_for_kmer(*KrakenDatabases[i], kmer_ptr,
db_statuses[i].current_bin_key, db_statuses[i].current_min_pos, db_statuses[i].current_max_pos);
taxon = get_taxon_for_kmer(*KrakenDatabases[i], kmer_ptr,
db_statuses[i].current_bin_key, db_statuses[i].current_min_pos, db_statuses[i].current_max_pos);

//uint32_t* val_ptr = KrakenDatabases[i]->kmer_query(
// KrakenDatabases[i]->canonical_representation(*kmer_ptr), &db_statuses[i].current_bin_key,
// &db_statuses[i].current_min_pos, &db_statuses[i].current_max_pos);
//taxon = val_ptr ? *val_ptr : 0;
if (taxon) break;
}

//cerr << "taxon for " << *kmer_ptr << " is " << taxon << endl;

// cerr << "taxon for " << *kmer_ptr << " is " << taxon << endl;
my_taxon_counts[taxon].add_kmer(*kmer_ptr);

if (taxon) {
if (taxon == -1) {
cerr << "ERROR: Invalid taxon (-1)" << endl;
exit(1);
}
hit_counts[taxon]++;
if (Quick_mode && ++hits >= Minimum_hit_count)
break;
}
taxa.push_back(taxon);
}
taxa.push_back(taxon);
//append_hitlist_string(hitlist_string, last_taxon, last_counter, taxon);
}
}
Expand All @@ -488,7 +530,8 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss,
cerr << "Quick mode not available when mapping UIDs" << endl;
exit(1);
} else {
call = resolve_uids2(hit_counts, Parent_map, (const uint32_t *)UID_to_TaxID_map_file.ptr(), UID_to_TaxID_map_file.size());
call = resolve_uids2(hit_counts, Parent_map,
UID_to_TaxID_map_file.ptr(), UID_to_TaxID_map_file.size());
}
} else {
if (Quick_mode)
Expand All @@ -497,8 +540,6 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss,
call = resolve_tree(hit_counts, Parent_map);
}


#pragma omp atomic
++(my_taxon_counts[call].n_reads);

if (Print_unclassified && !call)
Expand Down Expand Up @@ -528,7 +569,7 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss,
if (taxa.empty())
koss << "0:0";
else
koss << hitlist_string(taxa);
koss << hitlist_string(taxa, ambig_list);
//if (hitlist_string.empty() && last_counter == 0)
// koss << "0:0";
//else {
Expand Down
32 changes: 29 additions & 3 deletions src/taxdb.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <unordered_map>
#include <vector>
#include <unordered_set>
#include <iomanip>
#include "report-cols.h"

using namespace std;
Expand Down Expand Up @@ -223,6 +224,7 @@ class TaxonomyDB {

void setGenomeSizes(const std::unordered_map<TAXID, uint64_t> & genomeSizes);
void setReadCounts(const std::unordered_map<TAXID, READCOUNTS>& readCounts);
void readGenomeSizes(string file);
void setGenomeSize(const TAXID taxid, const uint64_t genomeSize);
void addReadCount(const TAXID taxid, const READCOUNTS& readCounts_);

Expand Down Expand Up @@ -876,6 +878,23 @@ void TaxonomyDB<TAXID,READCOUNTS>::setGenomeSize(const TAXID taxid, const uint64
}


// Load per-taxon genome (k-mer database) sizes from a whitespace-separated
// file of "<taxID> <size>" pairs, replacing any sizes recorded so far.
// Throws std::runtime_error when the file cannot be opened.
template<typename TAXID, typename READCOUNTS>
void TaxonomyDB<TAXID,READCOUNTS>::readGenomeSizes(string file) {
  // Reset all previously accumulated sizes so re-reading is idempotent.
  for (auto& entry : taxIDsAndEntries) {
    entry.second.genomeSize = 0;
    entry.second.genomeSizeOfChildren = 0;
  }
  log_msg("Reading genome sizes from " + file);
  std::ifstream inFile(file);
  if (!inFile.is_open())
    throw std::runtime_error("unable to open file " + file);
  TAXID taxonomyID;
  uint64_t size;
  // Loop on the extraction itself rather than eof(): checking eof() before
  // reading processes a stale pair when the file ends in a newline or a
  // line fails to parse.
  while (inFile >> taxonomyID >> size) {
    setGenomeSize(taxonomyID, size);
  }
}

template<typename TAXID, typename READCOUNTS>
void TaxonomyDB<TAXID,READCOUNTS>::setReadCounts(const unordered_map<TAXID, READCOUNTS>& readCounts) {
Expand Down Expand Up @@ -967,22 +986,29 @@ void TaxReport<TAXID,READCOUNTS>::printReport(TaxonomyEntry<TAXID,READCOUNTS>& t

template<typename TAXID, typename READCOUNTS>
void TaxReport<TAXID,READCOUNTS>::printLine(TaxonomyEntry<TAXID,READCOUNTS>& tax, unsigned depth) {

long long unique_kmers_for_clade = ( tax.readCounts.kmers.cardinality() + tax.readCountsOfChildren.kmers.cardinality());
double genome_size = double(tax.genomeSize+tax.genomeSizeOfChildren);

for (auto& col : _report_cols) {
switch (col) {
case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break;
case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break;
case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? -1 : (int32_t) tax.taxonomyID); break;
case REPORTCOLS::DEPTH: _reportOfb << depth; break;
case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(reads(tax.readCounts) + reads(tax.readCountsOfChildren))/_total_n_reads; break;
case REPORTCOLS::PERCENTAGE: _reportOfb << setprecision(4) << 100.0*(reads(tax.readCounts) + reads(tax.readCountsOfChildren))/_total_n_reads; break;
//case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break;
//case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break;
case REPORTCOLS::NUM_READS: _reportOfb << reads(tax.readCounts); break;
case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.readCounts) + reads(tax.readCountsOfChildren)); break;
case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.readCounts.kmers.cardinality(); break;
case REPORTCOLS::NUM_UNIQUE_KMERS_CLADE: _reportOfb << (tax.readCounts.kmers.cardinality() + tax.readCountsOfChildren.kmers.cardinality()); break;
case REPORTCOLS::NUM_UNIQUE_KMERS_CLADE: _reportOfb << unique_kmers_for_clade; break;
case REPORTCOLS::NUM_KMERS: _reportOfb << tax.readCounts.n_kmers; break;
case REPORTCOLS::NUM_KMERS_CLADE: _reportOfb << tax.readCounts.n_kmers + tax.readCountsOfChildren.n_kmers; break;
case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize; break;
case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize; break;
case REPORTCOLS::CLADE_KMER_COVERAGE: if (genome_size == 0) { _reportOfb << "NA"; } else {
_reportOfb << setprecision(4) << (unique_kmers_for_clade / genome_size); }; break;
case REPORTCOLS::CLADE_KMER_DUPLICITY: _reportOfb << setprecision(3) << ( double(tax.readCounts.n_kmers + tax.readCountsOfChildren.n_kmers) / unique_kmers_for_clade ); break;
case REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE: _reportOfb << tax.genomeSize + tax.genomeSizeOfChildren; break;
//case REPORTCOLS::GENOME_SIZE: ; break;
//case REPORTCOLS::NUM_WEIGHTED_READS: ; break;
Expand Down
Loading

0 comments on commit 6e909c1

Please sign in to comment.