From de12003be89856301511ea471aecc845f6851ab8 Mon Sep 17 00:00:00 2001 From: Rone Charles Date: Wed, 1 Feb 2023 12:29:57 -0500 Subject: [PATCH] Address feedback and fix potential undefined behavior in seqreader.cc --- scripts/mask_low_complexity.sh | 4 +- src/CMakeLists.txt | 4 +- src/Makefile | 9 +- src/dustmasker.cc | 322 ---------------------------- src/k2mask.cc | 370 +++++++++++++++++++++++++++++++++ src/seqreader.cc | 2 + 6 files changed, 381 insertions(+), 330 deletions(-) delete mode 100644 src/dustmasker.cc create mode 100644 src/k2mask.cc diff --git a/scripts/mask_low_complexity.sh b/scripts/mask_low_complexity.sh index ef07555..0fa5251 100755 --- a/scripts/mask_low_complexity.sh +++ b/scripts/mask_low_complexity.sh @@ -13,7 +13,7 @@ set -o pipefail # Stop on failures in non-final pipeline commands target="$1" -MASKER="dustmasker" +MASKER="k2mask" if [ -n "$KRAKEN2_PROTEIN_DB" ]; then MASKER="segmasker" fi @@ -33,7 +33,7 @@ if [ -d $target ]; then done elif [ -f $target ]; then if [ ! -e "$target.masked" ]; then - $MASKER -in $target -outfmt fasta | sed -e '/^>/!s/[a-z]/x/g' > "$target.tmp" + $MASKER -in $target -threads 4 -outfmt fasta | sed -e '/^>/!s/[a-z]/x/g' > "$target.tmp" mv "$target.tmp" $target touch "$target.masked" fi diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3c316a0..95ccaf3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -43,8 +43,8 @@ add_executable(lookup_accession_numbers omp_hack.cc utilities.cc) -add_executable(dustmasker - dustmasker.cc +add_executable(k2mask + k2mask.cc seqreader.cc) target_link_libraries(dustmasker ${ZLIB_LIBRARIES} Threads::Threads) diff --git a/src/Makefile b/src/Makefile index 4453faa..a913327 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,10 +1,11 @@ CXX = g++ -CXXFLAGS = -fopenmp -Wall -std=c++11 -O3 +OMPFLAG ?= -fopenmp +CXXFLAGS = $(OMPFLAG) -Wall -std=c++11 -O3 CXXFLAGS += -DLINEAR_PROBING .PHONY: all clean install -PROGS = estimate_capacity build_db classify dump_table lookup_accession_numbers +PROGS = estimate_capacity build_db classify dump_table lookup_accession_numbers k2mask all: $(PROGS) @@ -24,13 +25,13 @@ omp_hack.o: omp_hack.cc omp_hack.h reports.o: reports.cc reports.h kraken2_data.h aa_translate.o: aa_translate.cc aa_translate.h utilities.o: utilities.cc utilities.h -dustmasker.o: dustmasker.cc gzstream.h threadpool.h classify.o: classify.cc kraken2_data.h kv_store.h taxonomy.h seqreader.h mmscanner.h compact_hash.h aa_translate.h reports.h utilities.h readcounts.h dump_table.o: dump_table.cc compact_hash.h taxonomy.h mmscanner.h kraken2_data.h reports.h estimate_capacity.o: estimate_capacity.cc kv_store.h mmscanner.h seqreader.h utilities.h build_db.o: build_db.cc taxonomy.h mmscanner.h seqreader.h compact_hash.h kv_store.h kraken2_data.h utilities.h lookup_accession_numbers.o: lookup_accession_numbers.cc mmap_file.h utilities.h +k2mask.o: k2mask.cc gzstream.h threadpool.h build_db: build_db.o mmap_file.o compact_hash.o taxonomy.o seqreader.o mmscanner.o omp_hack.o utilities.o $(CXX) $(CXXFLAGS) -o $@ $^ @@ -47,5 +48,5 @@ dump_table: dump_table.o mmap_file.o compact_hash.o omp_hack.o taxonomy.o report lookup_accession_numbers: lookup_accession_numbers.o mmap_file.o omp_hack.o utilities.o $(CXX) $(CXXFLAGS) -o $@ $^ -dustmasker: dustmasker.o seqreader.o +k2mask: k2mask.o seqreader.o $(CXX) $(CXXFLAGS) -o $@ $^ -lz -pthread diff --git a/src/dustmasker.cc b/src/dustmasker.cc deleted file mode 100644 index 1de2562..0000000 --- a/src/dustmasker.cc +++ /dev/null @@ -1,322 +0,0 @@ -/* - * Symmetric Dustmasker mostly based on a similarly named implementation - * by Heng Li as part of the minimap project. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "gzstream.h" -#include "seqreader.h" -#include "threadpool.h" - -using namespace kraken2; - -uint8_t asc2dna[] = { - /* 0 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* 16 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* 32 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* - */ - /* 48 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* 64 */ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - /* A B C D G H K M N */ - /* 80 */ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* R S T U V W Y */ - /* 96 */ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - /* a b c d g h k m n */ - /* 112 */ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* r s t u v w y */ - /* 128 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* 144 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* 160 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* 176 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* 192 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* 208 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* 224 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* 240 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, -}; - -struct PerfectInterval { - int start; - int finish; - int left; - int right; -}; - -struct MaskRange { - int start; - int finish; -}; - -struct SDust { - SDust(): rw(0), rv(0), l(0) { - memset(cv, 0, 64 * sizeof(int)); - memset(cw, 0, 64 * sizeof(int)); - } - - void reset() { - kmers.clear(); - perfectIntervals.clear(); - ranges.clear(); - memset(cw, 0, 64 * sizeof(int)); - memset(cv, 0, 64 * sizeof(int)); - l = rv = rv = 0; - } - - std::deque kmers; - std::vector perfectIntervals; - std::vector ranges; - Sequence seq; - int cw[64]; - int cv[64]; - int rw; - int rv; - int l; -}; - -int windowSize = 64; -int threshold = 20; -std::function maskChar = tolower; - -void usage(const char *prog) { - std::cerr << "usage: " << prog << " [-T threshold] [-W window size] [-l line length] [-r char] [-t threads]" - << std::endl; - exit(1); -} - -void shiftWindow(SDust &sd, int t) { - int s; - if (sd.kmers.size() >= windowSize - 2) { - s = sd.kmers.front(); - sd.kmers.pop_front(); - sd.rw -= --sd.cw[s]; - if (sd.l > sd.kmers.size()) { - --sd.l; - sd.rv -= --sd.cv[s]; - } - } - sd.kmers.push_back(t); - sd.l++; - sd.rw += sd.cw[t]++; - sd.rv += sd.cv[t]++; - if (sd.cv[t] * 10 > threshold * 2) { - do { - s = sd.kmers.at(sd.kmers.size() - sd.l); - sd.rv -= --sd.cv[s]; - sd.l--; - } while (s != t); - } -} - -void saveMaskedRegions(SDust &sd, int windowStart) { - bool saved = false; - if (sd.perfectIntervals.size() == 0 - || sd.perfectIntervals.back().start >= windowStart) - return; - PerfectInterval &p = sd.perfectIntervals.back(); - if (!sd.ranges.empty()) { - int start = sd.ranges.back().start; - int finish = sd.ranges.back().finish; - if (p.start <= finish) { - sd.ranges.back() = MaskRange { start, std::max(p.finish, finish) }; - saved = true; - } - } - if (!saved) - sd.ranges.push_back(MaskRange { p.start, p.finish}); - while (!sd.perfectIntervals.empty() - && sd.perfectIntervals.back().start < windowStart) - sd.perfectIntervals.pop_back(); -} - -void findPerfect(SDust &sd, int windowStart) { - int cv[64]; - int maxLeft = 0; - int maxRight = 0; - int newLeft = 0; - int newRight = sd.rv; - - memcpy(cv, sd.cv, 64 * sizeof(int)); - for (int i = sd.kmers.size() - sd.l - 1; i >= 0; --i) { - int j; - int kmer = sd.kmers.at(i); - newRight += cv[kmer]++; - newLeft = sd.kmers.size() - i - 1; - if (newRight * 10 > threshold * newLeft) { - for (j = 0; j < sd.perfectIntervals.size() && sd.perfectIntervals[j].start >= i + windowStart; ++j) { - PerfectInterval &p = sd.perfectIntervals[j]; - if (maxRight == 0 || p.right * maxLeft > maxRight * p.left) { - maxLeft = p.left; - maxRight = p.right; - } - } - if (maxRight == 0 || newRight * maxLeft >= maxRight * newLeft) { - maxLeft = newLeft; - maxRight = newRight; - PerfectInterval p; - p.start = i + windowStart; - p.finish = sd.kmers.size() + 2 + windowStart; - p.left = newLeft; - p.right = newRight; - auto position = sd.perfectIntervals.begin() + j; - sd.perfectIntervals.insert(position, p); - } - } - } -} - -void runSymmetricDust(SDust &sd, char *seq, size_t size, int offset) { - int triplet = 0; - int windowStart; - for (int i = 0, l = 0; i < size; i++) { - int base = asc2dna[(int)seq[i]]; - if (base < 4) { - l++; - triplet = (triplet << 2 | base) & 63; - if (l >= 3) { - windowStart = std::max(l - windowSize, 0); - saveMaskedRegions(sd, windowStart); - shiftWindow(sd, triplet); - if (sd.rw * 10 > sd.l * threshold) - findPerfect(sd, windowStart); - } - } - } - while (!sd.perfectIntervals.empty()) - saveMaskedRegions(sd, windowStart++); -} - -void printFasta(Sequence seq, std::ostream& out, int width) { - out.write(&seq.header[0], seq.header.size()); - out << '\n'; - for (size_t i = 0; i < seq.seq.size(); i += width) { - if (i + width >= seq.seq.size()) - width = seq.seq.size() - i; - out.write(&seq.seq[i], width); - out << '\n'; - } -} - -SDust *mask(SDust *sd) { - size_t i = 0; // ns = 0; - std::string &seq = sd->seq.seq; - - // for (i = 0; i < seq.size() && asc2dna[seq[i]] == 4; i++) - // seq[i] = maskChar(seq[i]); - for (; i < seq.size(); i++) { - if (asc2dna[(int)seq[i]] != 4) { - // if (ns >= windowSize) { - // for (; ns > 0; --ns) { - // seq[i - ns] = maskChar(seq[i - ns]); - // } - // } - // ns = 0; - int start = i; - for (;;) { - seq[i] = toupper(seq[i]); - if ((i+1) == seq.size() - || asc2dna[(int)seq[i+1]] == 4) - break; - i++; - } - char *s = &seq[0] + start; - runSymmetricDust(*sd, s, i - start + 1, start); - for (size_t j = 0; j < sd->ranges.size(); j++) { - for (size_t i = sd->ranges[j].start; i < sd->ranges[j].finish; i++) - s[i] = maskChar(s[i]); - } - sd->ranges.clear(); - // } else { - // ns++; - } - } - // while (asc2dna[seq[--i]] == 4) - // seq[i] = maskChar(seq[i]); - return sd; -} - -int main(int argc, char **argv) { - int ch; - int lineLength = 72; - int threads = 1; - std::string buffer; - const char *prog = "dustmasker"; - while ((ch = getopt(argc, argv, "W:T:l:t:r:")) != -1) { - switch (ch) { - case 'W': - windowSize = atoi(optarg); - break; - case 'T': - threshold = atoi(optarg); - break; - case 'l': - lineLength = atoi(optarg); - break; - case 't': - threads = atoi(optarg); - break; - case 'r': { - if (strlen(optarg) != 1) { - std::cerr << prog << ": -r expects a single character, " - << optarg << " given." << std::endl; - usage(prog); - } - int r = optarg[0]; - maskChar = [=](char c) { return r; }; - break; - } - default: - usage(prog); - } - } - argc -= optind; - argv += optind; - - if (argc > 0) { - std::cerr << prog << ": reads from stdin and writes to stdout" - << std::endl; - usage(prog); - } - gzistream is("/dev/stdin"); - std::vector sds(threads); - for (size_t i = 0; i < sds.size(); i++) { - sds[i] = new SDust(); - } - thread_pool pool(threads - 1); - std::queue> tasks; - for (SDust *sd = sds.back(); BatchSequenceReader::ReadNextSequence(is, sd->seq, buffer); sd = sds.back()) { - sds.pop_back(); - if (threads > 1) { - tasks.push(pool.submit(mask, sd)); - while (sds.empty()) { - sd = tasks.front().get(); - printFasta(sd->seq, std::cout, lineLength); - tasks.pop(); - sd->reset(); - sds.push_back(sd); - } - } else { - mask(sd); - printFasta(sd->seq, std::cout, lineLength); - sds.push_back(sd); - } - } - while (!tasks.empty()) { - SDust *sd = tasks.front().get(); - printFasta(sd->seq, std::cout, lineLength); - tasks.pop(); - } - for (size_t i = 0; i < sds.size(); i++) - delete(sds[i]); - std::cout.flush(); -} diff --git a/src/k2mask.cc b/src/k2mask.cc new file mode 100644 index 0000000..2e7abfb --- /dev/null +++ b/src/k2mask.cc @@ -0,0 +1,370 @@ +/* + * Symmetric Dustmasker mostly based on a similarly named implementation + * by Heng Li as part of the minimap project. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "gzstream.h" +#include "seqreader.h" +#include "threadpool.h" + +using namespace kraken2; + +uint8_t asc2dna[] = { + /* 0 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 16 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 32 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* - */ + /* 48 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 64 */ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + /* A B C D G H K M N */ + /* 80 */ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* R S T U V W Y */ + /* 96 */ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + /* a b c d g h k m n */ + /* 112 */ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* r s t u v w y */ + /* 128 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 144 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 160 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 176 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 192 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 208 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 224 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 240 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, +}; + +struct PerfectInterval { + int start; + int finish; + int left; + int right; +}; + +struct MaskRange { + int start; + int finish; +}; + +struct SDust { + SDust(): rw(0), rv(0), l(0) { + memset(cv, 0, 64 * sizeof(int)); + memset(cw, 0, 64 * sizeof(int)); + } + + void reset() { + kmers.clear(); + perfectIntervals.clear(); + ranges.clear(); + memset(cw, 0, 64 * sizeof(int)); + memset(cv, 0, 64 * sizeof(int)); + l = rw = rv = 0; + } + + std::deque kmers; + std::vector perfectIntervals; + std::vector ranges; + Sequence seq; + int cw[64]; + int cv[64]; + int rw; + int rv; + int l; +}; + +int windowSize = 64; +int threshold = 20; +std::function processMaskedNucleotide = tolower; + +void usage(const char *prog) { + std::cerr << "usage: " << prog + << " [-T | -level threshold] [-W | -window window size] [-i | -in input file]" + << std::endl + << " [-w | -width sequence width] [-o | -out output file] [-r | -replace char]" + << std::endl + << " [-f | -infmt input format] [-t | -threads threads]" + << std::endl; + exit(1); +} + +bool stricasecmp(std::string &s1, std::string &s2) { + if (s1.size() != s2.size()) + return false; + for (size_t i = 0; i < s1.size(); i++) { + if (tolower(s1[i]) != tolower(s2[i])) + return false; + } + return true; +} + +void shiftWindow(SDust &sd, int t) { + int s; + if ((int)sd.kmers.size() >= windowSize - 2) { + s = sd.kmers.front(); + sd.kmers.pop_front(); + sd.rw -= --sd.cw[s]; + if (sd.l > (int)sd.kmers.size()) { + --sd.l; + sd.rv -= --sd.cv[s]; + } + } + sd.kmers.push_back(t); + sd.l++; + sd.rw += sd.cw[t]++; + sd.rv += sd.cv[t]++; + if (sd.cv[t] * 10 > threshold * 2) { + do { + s = sd.kmers.at(sd.kmers.size() - sd.l); + sd.rv -= --sd.cv[s]; + sd.l--; + } while (s != t); + } +} + +void saveMaskedRegions(SDust &sd, int windowStart) { + bool saved = false; + if (sd.perfectIntervals.size() == 0 + || sd.perfectIntervals.back().start >= windowStart) + return; + PerfectInterval &p = sd.perfectIntervals.back(); + if (!sd.ranges.empty()) { + int start = sd.ranges.back().start; + int finish = sd.ranges.back().finish; + if (p.start <= finish) { + sd.ranges.back() = MaskRange { start, std::max(p.finish, finish) }; + saved = true; + } + } + if (!saved) + sd.ranges.push_back(MaskRange { p.start, p.finish}); + while (!sd.perfectIntervals.empty() + && sd.perfectIntervals.back().start < windowStart) + sd.perfectIntervals.pop_back(); +} + +void findPerfect(SDust &sd, int windowStart) { + int cv[64]; + int maxLeft = 0; + int maxRight = 0; + int newLeft = 0; + int newRight = sd.rv; + + memcpy(cv, sd.cv, 64 * sizeof(int)); + for (int i = sd.kmers.size() - sd.l - 1; i >= 0; --i) { + size_t j; + int kmer = sd.kmers.at(i); + newRight += cv[kmer]++; + newLeft = sd.kmers.size() - i - 1; + if (newRight * 10 > threshold * newLeft) { + for (j = 0; j < sd.perfectIntervals.size() && sd.perfectIntervals[j].start >= i + windowStart; ++j) { + PerfectInterval &p = sd.perfectIntervals[j]; + if (maxRight == 0 || p.right * maxLeft > maxRight * p.left) { + maxLeft = p.left; + maxRight = p.right; + } + } + if (maxRight == 0 || newRight * maxLeft >= maxRight * newLeft) { + maxLeft = newLeft; + maxRight = newRight; + PerfectInterval p; + p.start = i + windowStart; + p.finish = sd.kmers.size() + 2 + windowStart; + p.left = newLeft; + p.right = newRight; + auto position = sd.perfectIntervals.begin() + j; + sd.perfectIntervals.insert(position, p); + } + } + } +} + +void runSymmetricDust(SDust &sd, char *seq, size_t size, int offset) { + int triplet = 0; + int windowStart = 0; + int l = 0; + for (size_t i = 0; i < size; i++) { + int base = asc2dna[(int)seq[i]]; + if (base < 4) { + l++; + triplet = (triplet << 2 | base) & 63; + if (l >= 3) { + windowStart = std::max(l - (int)windowSize, 0); + saveMaskedRegions(sd, windowStart); + shiftWindow(sd, triplet); + if (sd.rw * 10 > sd.l * threshold) + findPerfect(sd, windowStart); + } + } + } + while (!sd.perfectIntervals.empty()) + saveMaskedRegions(sd, windowStart++); +} + +void printFasta(Sequence seq, std::ostream& out, int width) { + out.write(&seq.header[0], seq.header.size()); + out << '\n'; + for (size_t i = 0; i < seq.seq.size(); i += width) { + if (i + width >= seq.seq.size()) + width = seq.seq.size() - i; + out.write(&seq.seq[i], width); + out << '\n'; + } +} + +SDust *mask(SDust *sd) { + size_t i = 0; + std::string &seq = sd->seq.seq; + + for (; i < seq.size(); i++) { + if (asc2dna[(int)seq[i]] != 4) { + int start = i; + for (;;) { + seq[i] = toupper(seq[i]); + if ((i+1) == seq.size() + || asc2dna[(int)seq[i+1]] == 4) + break; + i++; + } + char *s = &seq[0] + start; + runSymmetricDust(*sd, s, i - start + 1, start); + for (size_t j = 0; j < sd->ranges.size(); j++) { + for (int i = sd->ranges[j].start; i < sd->ranges[j].finish; i++) + s[i] = processMaskedNucleotide(s[i]); + } + sd->ranges.clear(); + } + } + return sd; +} + +int main(int argc, char **argv) { + int ch; + int lineWidth = 72; + int threads = 1; + std::string infile = "/dev/stdin"; + std::string outfile = "/dev/stdout"; + std::string buffer; + const char *prog = "k2mask"; + + struct option longopts[] = { + {"window", required_argument, NULL, 'W'}, + {"level", required_argument, NULL, 'T'}, + {"in", required_argument, NULL, 'i'}, + {"out", required_argument, NULL, 'o'}, + {"width", required_argument, NULL, 'l'}, + {"outfmt", required_argument, NULL, 'f'}, + {"threads", required_argument, NULL, 't'}, + {"replace", required_argument, NULL, 'r'}, + {"help", no_argument, NULL, 'h'}, + {NULL, 0, NULL, 0} + }; + + while ((ch = getopt_long_only(argc, argv, "W:T:hi:l:o:r:t:", longopts, NULL)) != -1) { + switch (ch) { + case 'W': + windowSize = atoi(optarg); + break; + case 'T': + threshold = atoi(optarg); + break; + case 'i': + // Input file name (default: stdin) + infile = optarg; + break; + case 'w': + // Wrap sequence every after outputting this many (72) characters + lineWidth = atoi(optarg); + break; + case 'f': { + // Output format, currently FASTA only + std::string arg(optarg); + std::string format("fasta"); + + if (!stricasecmp(arg, format)) { + std::cerr << prog << ": currently only supports outputting FASTA." + << std::endl; + std::exit(1); + } + break; + } + case 'o': + // Output file name (default: stdout) + outfile = optarg; + break; + case 't': + // Number of threads. If _n_ threads are specified, + // _n - 1_ threads will be used for masking and the + // final thread used to manage I/O. + threads = atoi(optarg); + break; + case 'r': { + // Replace masked character with a specified letter. (default tolower) + if (strlen(optarg) != 1) { + std::cerr << prog << ": -r expects a single character, " + << optarg << " given." << std::endl; + usage(prog); + } + int r = optarg[0]; + processMaskedNucleotide = [=](char c) { return r; }; + break; + } + case 'h': + default: + usage(prog); + } + } + argc -= optind; + argv += optind; + + if (argc > 0) { + std::cerr << prog << ": reads from stdin and writes to stdout" + << std::endl; + usage(prog); + } + gzistream is(infile.c_str()); + std::ofstream out(outfile, std::ios::out | std::ios::trunc); + std::vector sds(threads); + for (size_t i = 0; i < sds.size(); i++) { + sds[i] = new SDust(); + } + thread_pool pool(threads - 1); + std::queue> tasks; + for (SDust *sd = sds.back(); BatchSequenceReader::ReadNextSequence(is, sd->seq, buffer); sd = sds.back()) { + sds.pop_back(); + if (threads > 1) { + tasks.push(pool.submit(mask, sd)); + while (sds.empty()) { + sd = tasks.front().get(); + printFasta(sd->seq, out, lineWidth); + tasks.pop(); + sd->reset(); + sds.push_back(sd); + } + } else { + mask(sd); + printFasta(sd->seq, out, lineWidth); + sds.push_back(sd); + } + } + while (!tasks.empty()) { + SDust *sd = tasks.front().get(); + printFasta(sd->seq, out, lineWidth); + tasks.pop(); + } + for (size_t i = 0; i < sds.size(); i++) + delete(sds[i]); + out.flush(); +} diff --git a/src/seqreader.cc b/src/seqreader.cc index b7a5661..075a994 100644 --- a/src/seqreader.cc +++ b/src/seqreader.cc @@ -11,6 +11,8 @@ using std::string; namespace kraken2 { void StripString(string &str) { + if (str.size() == 0) + return; while (isspace(str.back())) str.pop_back(); }