diff --git a/alignment.cpp b/alignment.cpp index 58f81ef..9b2da8f 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -46,10 +46,10 @@ namespace Alignment_sp -// Mutation +// AmrMutation -Mutation::Mutation (size_t pos_arg, +AmrMutation::AmrMutation (size_t pos_arg, const string &geneMutation_arg, const string &class_arg, const string &subclass_arg, @@ -85,7 +85,7 @@ Mutation::Mutation (size_t pos_arg, -void Mutation::parse (const string &geneMutation, +void AmrMutation::parse (const string &geneMutation, string &reference, string &allele, string &gene, @@ -142,7 +142,7 @@ void Mutation::parse (const string &geneMutation, -bool Mutation::operator< (const Mutation &other) const +bool AmrMutation::operator< (const AmrMutation &other) const { LESS_PART (*this, other, gene); LESS_PART (*this, other, pos); @@ -195,7 +195,7 @@ void SeqChange::qc () const QC_ASSERT (start_target <= al->targetEnd); QC_ASSERT (prev != '-'); QC_IMPLY (reference. empty (), prev); - for (const Mutation* mutation : mutations) + for (const AmrMutation* mutation : mutations) { QC_ASSERT (mutation); QC_ASSERT (between (mutation->pos, al->refStart, al->refEnd)); @@ -379,7 +379,7 @@ void SeqChange::setNeighborhoodMismatch (size_t flankingLen) -bool SeqChange::matchesMutation (const Mutation& mut) const +bool SeqChange::matchesMutation (const AmrMutation& mut) const { if (empty ()) return false; @@ -635,7 +635,7 @@ void Alignment::refMutation2refSeq () -void Alignment::setSeqChanges (const Vector &refMutations, +void Alignment::setSeqChanges (const Vector &refMutations, size_t flankingLen/*, bool allMutationsP*/) { @@ -707,10 +707,10 @@ void Alignment::setSeqChanges (const Vector &refMutations, IMPLY (start_ref_prev != no_index, start_ref_prev <= seqChange. start_ref); while (j < refMutations. size ()) { - const Mutation& mut = refMutations [j]; + const AmrMutation& mut = refMutations [j]; if (verbose ()) { - cout << "Processing Mutation: "; + cout << "Processing AmrMutation: "; mut. saveText (cout); cout << endl; } @@ -751,7 +751,7 @@ void Alignment::setSeqChanges (const Vector &refMutations, { size_t refPos = refStart; size_t i = 0; - for (const Mutation& mut : refMutations) + for (const AmrMutation& mut : refMutations) { while (refPos < mut. pos) { diff --git a/alignment.hpp b/alignment.hpp index c0c47f2..844245d 100644 --- a/alignment.hpp +++ b/alignment.hpp @@ -47,7 +47,7 @@ static constexpr char pm_delimiter = '_'; -struct Mutation : Root +struct AmrMutation : Root // Database { size_t pos {0}; @@ -68,15 +68,15 @@ struct Mutation : Root int ref_pos {0}; - Mutation (size_t pos_arg, + AmrMutation (size_t pos_arg, const string &geneMutation_arg, const string &class_arg = "X", const string &subclass_arg = "X", const string &name_arg = "X"); // Input: pos_arg: 1-based - Mutation () = default; - Mutation (Mutation &&other) = default; - Mutation& operator= (Mutation &&other) = default; + AmrMutation () = default; + AmrMutation (AmrMutation &&other) = default; + AmrMutation& operator= (AmrMutation &&other) = default; private: static void parse (const string &geneMutation, string &reference, @@ -98,12 +98,12 @@ struct Mutation : Root { return pos + reference. size (); } string wildtype () const { return gene + "_" + reference + to_string (ref_pos + 1) + reference; } - bool operator< (const Mutation &other) const; - bool operator== (const Mutation &other) const + bool operator< (const AmrMutation &other) const; + bool operator== (const AmrMutation &other) const { return geneMutation == other. geneMutation; } void apply (string &seq) const { if (pos >= seq. size ()) - throw runtime_error ("Mutation position " + to_string (pos) + " is outside the sequence: " + seq); + throw runtime_error ("AmrMutation position " + to_string (pos) + " is outside the sequence: " + seq); seq = seq. substr (0, pos) + allele + seq. substr (pos + reference. size ()); } }; @@ -137,7 +137,7 @@ struct SeqChange : Root // 0..1 char prev {'\0'}; - VectorPtr mutations; + VectorPtr mutations; // !nullptr const SeqChange* replacement {nullptr}; @@ -148,7 +148,7 @@ struct SeqChange : Root : al (al_arg) {} SeqChange (const Alignment* al_arg, - const Mutation* mutation_arg) + const AmrMutation* mutation_arg) : al (al_arg) //, mutation (checkPtr (mutation_arg)) { mutations << checkPtr (mutation_arg); } @@ -165,7 +165,7 @@ struct SeqChange : Root << ' ' << start_target + 1 << ' ' << neighborhoodMismatch; //if (mutation) - for (const Mutation* mutation : mutations) + for (const AmrMutation* mutation : mutations) { os << ' ' ; mutation->saveText (os); } @@ -194,7 +194,7 @@ struct SeqChange : Root void setStartTarget (); void setNeighborhoodMismatch (size_t flankingLen); public: - bool matchesMutation (const Mutation& mut) const; + bool matchesMutation (const AmrMutation& mut) const; }; @@ -231,7 +231,7 @@ struct Alignment : Root size_t refStart {0}; size_t refEnd {0}; size_t refLen {0}; - Mutation refMutation; + AmrMutation refMutation; // !empty() => original refSeq is the result of refMutation.apply() to the original refSeq //int ref_offset {0}; @@ -255,7 +255,7 @@ struct Alignment : Root // Output: nident void refMutation2refSeq (); // Update: refSeq - void setSeqChanges (const Vector &refMutations, + void setSeqChanges (const Vector &refMutations, size_t flankingLen/*, bool allMutationsP*/); // Input: flankingLen: valid if > 0 diff --git a/amr_report.cpp b/amr_report.cpp index 2315c5d..2fdd6c1 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -325,7 +325,7 @@ struct Susceptible : Root -map > accession2mutations; +map > accession2mutations; map accession2susceptible; @@ -413,7 +413,7 @@ struct BlastAlignment : Alignment const string geneMutation = rfindSplit (refAccession, ':'); const size_t pos = str2 (rfindSplit (refAccession, ':')); ASSERT (refMutation. empty ()); - refMutation = move (Mutation (pos, geneMutation)); + refMutation = move (AmrMutation (pos, geneMutation)); ASSERT (! refMutation. empty ()); refMutation. qc (); refMutation2refSeq (); @@ -473,12 +473,12 @@ struct BlastAlignment : Alignment nident++; if (! targetProt) - cdss << move (Locus (0, targetName, targetStart, targetEnd, targetStrand, partialDna, 0)); + cdss << move (Locus (0, targetName, targetStart, targetEnd, targetStrand, partialDna, 0, string (), string ())); - if (const Vector* refMutations = findPtr (accession2mutations, refAccession)) + if (const Vector* refMutations = findPtr (accession2mutations, refAccession)) { if (verbose ()) - cout << "Mutation protein found: " << refAccession << endl << line << endl; + cout << "AmrMutation protein found: " << refAccession << endl << line << endl; QC_ASSERT (isMutation ()); setSeqChanges (*refMutations, 0); if (verbose ()) @@ -590,10 +590,10 @@ struct BlastAlignment : Alignment for (const SeqChange& seqChange : seqChanges_) { const string method (empty () ? na : getMethod (cds)); - VectorPtr mutations (seqChange. mutations); + VectorPtr mutations (seqChange. mutations); if (mutations. empty ()) mutations << nullptr; - for (const Mutation* mut : mutations) + for (const AmrMutation* mut : mutations) { IMPLY (! verbose (), isMutation () == ! (seqChange. empty () && ! mut)); TabDel td (2, false); @@ -715,7 +715,7 @@ struct BlastAlignment : Alignment { Set mutationSymbols; for (const SeqChange& seqChange : seqChanges) if (! seqChange. empty ()) - for (const Mutation* mutation : seqChange. mutations) + for (const AmrMutation* mutation : seqChange. mutations) mutationSymbols << mutation->geneMutation; return mutationSymbols; } @@ -1384,7 +1384,7 @@ struct Batch QC_ASSERT (pos > 0); replace (organism_, '_', ' '); if (organism_ == organism) - accession2mutations [accession] << move (Mutation ((size_t) pos, geneMutation, classS, subclass, name)); + accession2mutations [accession] << move (AmrMutation ((size_t) pos, geneMutation, classS, subclass, name)); } catch (const exception &e) { @@ -1737,13 +1737,13 @@ struct Batch #if 0 // [UNKNOWN] { - map mutation2ptr; + map mutation2ptr; for (const auto& it : accession2mutations) - for (const Mutation& mut : it. second) + for (const AmrMutation& mut : it. second) mutation2ptr [mut] = & mut; for (const BlastAlignment* al : goodBlastAls) for (const SeqChange& seqChange : al->seqChanges) - if (const Mutation* mut = seqChange. mutation) + if (const AmrMutation* mut = seqChange. mutation) mutation2ptr. erase (*mut); for (const auto& it : mutation2ptr) { diff --git a/amrfinder.cpp b/amrfinder.cpp index bde7e03..1c8ce9a 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,6 +33,8 @@ * cat, cp, cut, head, ln, mv, sort, tail * * Release changes: +* 3.10.4 03/24/2021 PD-3761 amrfinder --help will not break if /tmp is full +* 3.10.3 03/15/2021 PD-3749 --nucleotide_flank5_output, --nucleotide_flank5_size * 3.10.2 03/03/2021 PD-3729 Neighboring point mutations are reported * 3.10.1 02/17/2021 PD-3679 AMRProt-susceptible.tab * 3.9.10 02/16/2021 PD-3694 message about missing "latest/" symbolic link; amrfinder_update.cpp: createLatestLink() @@ -238,6 +240,8 @@ struct ThisApplication : ShellApplication addKey ("output", "Write output to OUTPUT_FILE instead of STDOUT", "", 'o', "OUTPUT_FILE"); addKey ("protein_output", "Output protein FASTA file of reported proteins", "", '\0', "PROT_FASTA_OUT"); addKey ("nucleotide_output", "Output nucleotide FASTA file of reported nucleotide sequences", "", '\0', "NUC_FASTA_OUT"); + addKey ("nucleotide_flank5_output", "Output nucleotide FASTA file of reported nucleotide sequences with 5' flanking sequences", "", '\0', "NUC_FLANK5_FASTA_OUT"); + addKey ("nucleotide_flank5_size", "5' flanking sequence size for NUC_FLANK5_FASTA_OUT", "0", '\0', "NUC_FLANK5_SIZE"); addFlag ("quiet", "Suppress messages to STDERR", 'q'); addFlag ("gpipe_org", "NCBI internal GPipe organism names"); addKey ("parm", "amr_report parameters for testing: -nosame -noblast -skip_hmm_check -bed", "", '\0', "PARM"); @@ -323,7 +327,9 @@ struct ThisApplication : ShellApplication void prepare_fasta_extract (StringVector &&columns, - const string &tmpSuf) const + const string &tmpSuf, + bool saveHeader) const + // Input: tmp + ".amr" { TextTable t (tmp + ".amr"); t. qc (); @@ -331,7 +337,7 @@ struct ThisApplication : ShellApplication t. rows. filterValue ([] (const StringVector& row) { return row [0] == "NA"; }); t. rows. sort (); t. rows. uniq (); - t. saveHeader = false; + t. saveHeader = saveHeader; t. qc (); t. saveFile (tmp + "." + tmpSuf); } @@ -368,6 +374,8 @@ struct ThisApplication : ShellApplication const string output = shellQuote (getArg ("output")); const string prot_out = shellQuote (getArg ("protein_output")); const string dna_out = shellQuote (getArg ("nucleotide_output")); + const string dnaFlank5_out = shellQuote (getArg ("nucleotide_flank5_output")); + const uint dnaFlank5_size = arg2uint ("nucleotide_flank5_size"); const bool quiet = getFlag ("quiet"); const bool gpipe_org = getFlag ("gpipe_org"); @@ -427,7 +435,7 @@ struct ThisApplication : ShellApplication if (! emptyArg (output)) try { OFStream f (unQuote (output)); } - catch (...) { throw runtime_error ("Cannot open output file " + output); } + catch (...) { throw runtime_error ("Cannot create output file " + output); } // For timing... @@ -579,9 +587,15 @@ struct ThisApplication : ShellApplication } } if (emptyArg (prot) && ! emptyArg (prot_out)) - throw runtime_error ("Parameter --protein must be present for --protein_out"); + throw runtime_error ("Parameter --protein must be present for --protein_output"); if (emptyArg (dna) && ! emptyArg (dna_out)) - throw runtime_error ("Parameter --nucleotide must be present for --nucleotide_out"); + throw runtime_error ("Parameter --nucleotide must be present for --nucleotide_output"); + if (emptyArg (dna) && ! emptyArg (dnaFlank5_out)) + throw runtime_error ("Parameter --nucleotide must be present for --nucleotide_flank5_output"); + if (emptyArg (dnaFlank5_out) && dnaFlank5_size > 0) + throw runtime_error ("Parameter --nucleotide_flank5_output must be present for --nucleotide_flank5_size"); + if (! emptyArg (dnaFlank5_out) && dnaFlank5_size == 0) + throw runtime_error ("Parameter --nucleotide_flank5_size must be present with a positive value for --nucleotide_flank5_output"); ASSERT (! searchMode. empty ()); if (emptyArg (organism)) includes << key2shortHelp ("organism") + " option to add mutation searches and suppress common proteins"; @@ -941,14 +955,30 @@ struct ThisApplication : ShellApplication if (! emptyArg (prot_out)) { - prepare_fasta_extract (StringVector {"Protein identifier", "Gene symbol", "Sequence name"}, "prot_out"); + prepare_fasta_extract (StringVector {"Protein identifier", "Gene symbol", "Sequence name"}, "prot_out", false); exec (fullProg ("fasta_extract") + prot + " " + tmp + ".prot_out -aa" + qcS + " -log " + logFName + " > " + prot_out, logFName); } if (! emptyArg (dna_out)) { - prepare_fasta_extract (StringVector {"Contig id", "Start", "Stop", "Strand", "Gene symbol", "Sequence name"}, "dna_out"); + prepare_fasta_extract (StringVector {"Contig id", "Start", "Stop", "Strand", "Gene symbol", "Sequence name"}, "dna_out", false); exec (fullProg ("fasta_extract") + dna + " " + tmp + ".dna_out" + qcS + " -log " + logFName + " > " + dna_out, logFName); } + if (! emptyArg (dnaFlank5_out)) + { + prepare_fasta_extract (StringVector {"Contig id", "Start", "Stop", "Strand", "Gene symbol", "Sequence name"}, "dna_out", true); + // 0 1 2 3 + TextTable t (tmp + ".dna_out"); + t. qc (); + for (StringVector& row : t. rows) + if (row [3] == "+") + row [1] = to_string (max (1, stoi (row [1]) - (int) dnaFlank5_size)); + else + row [2] = to_string (stoi (row [2]) + (int) dnaFlank5_size); + t. saveHeader = false; + t. qc (); + t. saveFile (tmp + ".dnaFlank5_out"); + exec (fullProg ("fasta_extract") + dna + " " + tmp + ".dnaFlank5_out" + qcS + " -log " + logFName + " > " + dnaFlank5_out, logFName); + } // timing the run diff --git a/common.cpp b/common.cpp index 0b051d0..12eae89 100644 --- a/common.cpp +++ b/common.cpp @@ -1089,6 +1089,7 @@ void Rand::run () // Threads size_t Threads::threadsToStart = 0; +bool Threads::quiet = false; @@ -1202,7 +1203,6 @@ string which (const string &progName) Threads::Threads (size_t threadsToStart_arg, bool quiet_arg) -: quiet (quiet_arg) { if (! isMainThread ()) throw logic_error ("Threads are started not from the main thread"); @@ -1213,9 +1213,11 @@ Threads::Threads (size_t threadsToStart_arg, if (threadsToStart >= threads_max) throw logic_error ("Too many threads to start"); + quiet = quiet_arg; + threads. reserve (threadsToStart); - if (! quiet && verbose (1)) + if (! quiet && verbose (1) && threadsToStart) cerr << "# Threads started: " << threadsToStart + 1 << endl; } @@ -1236,6 +1238,7 @@ Threads::~Threads () threads. clear (); threadsToStart = 0; + quiet = false; } @@ -2306,10 +2309,12 @@ void TextTable::filterColumns (const StringVector &newColumnNames) void TextTable::group (const StringVector &by, - const StringVector &sum) + const StringVector &sum, + const StringVector &aggr) { - const Vector byIndex (columns2indexes (by)); - const Vector sumIndex (columns2indexes (sum)); + const Vector byIndex (columns2indexes (by)); + const Vector sumIndex (columns2indexes (sum)); + const Vector aggrIndex (columns2indexes (aggr)); const auto lt = [&byIndex,this] (const StringVector &a, const StringVector &b) { for (const size_t i : byIndex) @@ -2332,7 +2337,7 @@ void TextTable::group (const StringVector &by, { ASSERT (i < j); if (rows [i]. same (rows [j], byIndex)) - merge (i, j, sumIndex); + merge (i, j, sumIndex, aggrIndex); else { i++; @@ -2350,10 +2355,13 @@ void TextTable::group (const StringVector &by, void TextTable::merge (RowNum toIndex, RowNum fromIndex, - const Vector &sum) + const Vector &sum, + const Vector &aggr) { + ASSERT (toIndex < fromIndex); StringVector& to = rows [toIndex]; const StringVector& from = rows [fromIndex]; + for (const size_t i : sum) { const Header& h = header [i]; @@ -2363,6 +2371,23 @@ void TextTable::merge (RowNum toIndex, oss << stod (to [i]) + stod (from [i]); to [i] = oss. str (); } + + for (const size_t i : aggr) + { + constexpr char sep = ','; + if (from [i]. empty ()) + continue; + if (to [i]. empty ()) + to [i] = from [i]; + else + { + StringVector vec (to [i], sep, true); + vec << from [i]; + vec. sort (); + vec. uniq (); + to [i] = vec. toString (string (1, sep)); + } + } } @@ -3439,6 +3464,7 @@ int Application::run (int argc, qc (); + createTmp (); body (); @@ -3467,7 +3493,7 @@ int Application::run (int argc, ShellApplication::~ShellApplication () { - if (! tmp. empty () && ! logPtr) + if (tmpCreated && ! logPtr) exec ("rm -fr " + tmp + "*"); } @@ -3485,12 +3511,6 @@ void ShellApplication::initEnvironment () tmp = s; else tmp = "/tmp"; - const string tmpDir (tmp); - tmp += "/XXXXXX"; - if (mkstemp (var_cast (tmp. c_str ())) == -1) - throw runtime_error ("Error creating a temporary file in " + tmpDir); - if (tmp. empty ()) - throw runtime_error ("Cannot create a temporary file in " + tmpDir); } // execDir, programName @@ -3514,6 +3534,24 @@ void ShellApplication::initEnvironment () +void ShellApplication::createTmp () +{ + ASSERT (! tmpCreated); + + if (useTmp) + { + const string tmpDir (tmp); + tmp += "/XXXXXX"; + if (mkstemp (var_cast (tmp. c_str ())) == -1) + throw runtime_error ("Error creating a temporary file in " + tmpDir); + if (tmp. empty ()) + throw runtime_error ("Cannot create a temporary file in " + tmpDir); + tmpCreated = true; + } +} + + + string ShellApplication::getHelp () const { string help (Application::getHelp ()); diff --git a/common.hpp b/common.hpp index c47b85b..c81b3f9 100644 --- a/common.hpp +++ b/common.hpp @@ -1278,7 +1278,7 @@ struct Threads : Singleton private: static size_t threadsToStart; vector threads; - const bool quiet; + static bool quiet; public: @@ -1289,6 +1289,8 @@ struct Threads : Singleton static bool empty () { return ! threadsToStart; } + static bool isQuiet () + { return quiet; } size_t getAvailable () const { return threadsToStart < threads. size () ? 0 : (threadsToStart - threads. size ()); } Threads& operator<< (thread &&t) @@ -1314,7 +1316,8 @@ struct Threads : Singleton template - void arrayThreads (const Func& func, + void arrayThreads (bool quiet, + const Func& func, size_t i_max, vector &results, Args&&... args) @@ -1334,7 +1337,7 @@ template if (chunk * threads_max < i_max) chunk++; ASSERT (chunk * threads_max >= i_max); - Threads th (threads_max - 1); + Threads th (threads_max - 1, quiet); FFOR (size_t, tn, threads_max) { const size_t from = tn * chunk; @@ -2421,6 +2424,16 @@ struct Set : set return false; } public: + bool intersects (const unordered_set &other) const + { if (universal) + return ! other. empty (); + if (other. empty ()) + return false; + for (const T& t : *this) + if (contains (other, t)) + return true; + return false; + } Set& operator<< (const T &el) { if (! universal) @@ -2690,7 +2703,7 @@ struct Progress : Nocopy static bool isUsed () { return beingUsed; } static bool enabled () - { return ! beingUsed && verbose (1); } + { return ! beingUsed && ! Threads::isQuiet () && verbose (1); } }; @@ -3253,11 +3266,13 @@ struct TextTable : Named // can be repeated // ordered void group (const StringVector &by, - const StringVector &sum); + const StringVector &sum, + const StringVector &aggr); private: void merge (RowNum toIndex, RowNum fromIndex, - const Vector &sum); + const Vector &sum, + const Vector &aggr); public: void indexes2values (const Vector &indexes, RowNum row_num, @@ -3873,6 +3888,8 @@ struct Application : Singleton, Root protected: virtual void initEnvironment () {} + virtual void createTmp () + {} string getInstruction () const; virtual string getHelp () const; public: @@ -3895,6 +3912,9 @@ struct ShellApplication : Application string tmp; // Temporary file prefix: ($TMPDIR or "/tmp") + "/XXXXXX" // If log is used then tmp is printed in the log file and the temporary files are not deleted +private: + bool tmpCreated {false}; +public: string execDir; // Ends with '/' // Physically real directory of the software @@ -3913,6 +3933,7 @@ struct ShellApplication : Application protected: void initEnvironment () override; + void createTmp () override; string getHelp () const override; private: void body () const final; diff --git a/dna_mutation.cpp b/dna_mutation.cpp index 86c23a7..3f4740d 100644 --- a/dna_mutation.cpp +++ b/dna_mutation.cpp @@ -48,7 +48,7 @@ using namespace Alignment_sp; namespace { -map > accession2mutations; +map > accession2mutations; unique_ptr mutation_all; string input_name; @@ -86,7 +86,7 @@ struct BlastnAlignment : Alignment } replace (product, '_', ' '); qc (); - if (const Vector* refMutations = findPtr (accession2mutations, refName)) + if (const Vector* refMutations = findPtr (accession2mutations, refName)) setSeqChanges (*refMutations, flankingLen); } catch (...) @@ -108,10 +108,10 @@ struct BlastnAlignment : Alignment { const string na ("NA"); for (const SeqChange& seqChange : seqChanges) { - VectorPtr mutations (seqChange. mutations); + VectorPtr mutations (seqChange. mutations); if (mutations. empty ()) mutations << nullptr; - for (const Mutation* mutation : mutations) + for (const AmrMutation* mutation : mutations) { ASSERT (! (seqChange. empty () && ! mutation)); TabDel td (2, false); @@ -205,7 +205,7 @@ struct Batch iss. reset (f. line); iss >> accession >> pos >> geneMutation >> classS >> subclass >> name; QC_ASSERT (pos > 0); - accession2mutations [accession]. push_back (move (Mutation ((size_t) pos, geneMutation, classS, subclass, name))); + accession2mutations [accession]. push_back (move (AmrMutation ((size_t) pos, geneMutation, classS, subclass, name))); } } for (auto& it : accession2mutations) @@ -231,7 +231,7 @@ struct Batch << "Stop" // targetEnd << "Strand" // targetStrand << "Gene symbol" - << "Mutation name" + << "AmrMutation name" << "Scope" // PD-2825 // PD-1856 << "Element type" @@ -367,13 +367,13 @@ struct ThisApplication : Application #if 0 // [UNKNOWN] { - map mutation2ptr; + map mutation2ptr; for (const auto& it : accession2mutations) - for (const Mutation& mut : it. second) + for (const AmrMutation& mut : it. second) mutation2ptr [mut] = & mut; for (const BlastnAlignment* al : batch. blastAls) for (const SeqChange& seqChange : al->seqChanges) - if (const Mutation* mut = seqChange. mutation) + if (const AmrMutation* mut = seqChange. mutation) mutation2ptr. erase (*mut); for (const auto& it : mutation2ptr) { diff --git a/fasta_extract.cpp b/fasta_extract.cpp index ab015dc..d317c56 100644 --- a/fasta_extract.cpp +++ b/fasta_extract.cpp @@ -114,16 +114,21 @@ void process (const string &id, replaceStr (seq, "-", ""); QC_ASSERT (! seq. empty ()); - for (const Segment& seg : *segments) + for (Segment& seg : var_cast (*segments)) { cout << '>' << id; if (seg. isDna ()) + { + QC_ASSERT (seg. start <= seq. size ()); + minimize (seg. stop, seq. size ()); + QC_ASSERT (seg. start < seg. stop); cout << ':' << seg. start + 1 << '-' << seg. stop << ' ' << "strand:" << (seg. strand ? '+' : '-'); + } cout << ' ' << seg. genesymbol << ' ' << seg. name << endl; string seq1 (seq); if (seg. isDna ()) { - QC_ASSERT (seg. stop <= seq1. size ()); + ASSERT (seg. stop <= seq1. size ()); seq1 = seq1. substr (seg. start, seg. size ()); if (! seg. strand) { @@ -148,7 +153,7 @@ struct ThisApplication : Application addPositional ("fasta", "FASTA file"); addPositional ("target", "Target identifiers in the FASTA file to extract.\n\ Line format for amino acid sequences : \n\ -Line format for nucleotide sequences : =1)> = start)> \ +Line format for nucleotide sequences : =1)> = start)> \ "); addFlag ("aa", "Amino acid sequenes, otherwise nucleotide"); version = SVN_REV; @@ -173,13 +178,16 @@ Line format for nucleotide sequences : =1)> = start)> > id; - char strand; if (! aa) { + char strand = '\0'; iss >> seg. start >> seg. stop >> strand; QC_ASSERT (seg. start); QC_ASSERT (seg. start <= seg. stop); seg. start--; + QC_ASSERT ( strand == '+' + || strand == '-' + ); seg. strand = (strand == '+'); } iss >> seg. genesymbol; diff --git a/gff.cpp b/gff.cpp index 57e9db0..5f50d45 100644 --- a/gff.cpp +++ b/gff.cpp @@ -53,7 +53,9 @@ Locus::Locus (size_t lineNum_arg, size_t stop_arg, bool strand_arg, bool partial_arg, - size_t crossOriginSeqLen_arg) + size_t crossOriginSeqLen_arg, + string gene_arg, + string product_arg) : lineNum (lineNum_arg) , contig (contig_arg) , start (start_arg) @@ -62,6 +64,8 @@ Locus::Locus (size_t lineNum_arg, , partial (partial_arg) , contigLen (crossOriginSeqLen_arg) , crossOrigin (bool (crossOriginSeqLen_arg)) +, gene (gene_arg) +, product (product_arg) { //QC_ASSERT (lineNum >= 1); trim (contig, '_'); @@ -86,7 +90,7 @@ bool Locus::operator< (const Locus& other) const LESS_PART (*this, other, start) LESS_PART (*this, other, stop) LESS_PART (*this, other, strand) - LESS_PART (*this, other, contigLen); +//LESS_PART (*this, other, contigLen); LESS_PART (*this, other, crossOrigin); return false; } @@ -225,15 +229,28 @@ Annot::Annot (Gff, const bool partial = contains (attributes, "partial=true"); string locusTag; + string gene; + string product; const string locusTagName (! pgap && (locus_tag || pseudo) ? "locus_tag=" : "Name="); while (! attributes. empty ()) { - locusTag = findSplit (attributes, ';'); - trim (locusTag, tmpSpace); - if (isLeft (locusTag, locusTagName)) - break; + string s (findSplit (attributes, ';')); + trim (s, tmpSpace); + if (isLeft (s, locusTagName)) + locusTag = s; + else if (isLeft (s, "gene=")) + { + gene = s; + findSplit (gene, '='); + } + else if (isLeft (s, "product=")) + { + product = s; + findSplit (product, '='); + replace (product, tmpSpace, ' '); + } } - if (! isLeft (locusTag, locusTagName)) + if (locusTag. empty ()) continue; //throw runtime_error (errorS + "No attribute '" + locusTagName + "': " + f. line); if (! pgap && contains (locusTag, ":")) @@ -250,7 +267,7 @@ Annot::Annot (Gff, } QC_ASSERT (! locusTag. empty ()); - const Locus locus (f. lineNum, contig, (size_t) start, (size_t) stop, strand == "+", partial, 0); + Locus locus (f. lineNum, contig, (size_t) start, (size_t) stop, strand == "+", partial, 0, gene, product); #if 0 // DNA may be truncated if (type == "CDS" && ! pseudo && locus. size () % 3 != 0) @@ -261,7 +278,7 @@ Annot::Annot (Gff, } #endif - prot2cdss [locusTag] << locus; + prot2cdss [locusTag] << move (locus); } } @@ -311,7 +328,7 @@ Annot::Annot (Bed, trim (locusTag, '_'); ASSERT (! locusTag. empty ()); - prot2cdss [locusTag] << Locus (f. lineNum, contig, start, stop, strand == '+', false/*partial*/, 0); + prot2cdss [locusTag] << Locus (f. lineNum, contig, start, stop, strand == '+', false/*partial*/, 0, string (), string ()); } } diff --git a/gff.hpp b/gff.hpp index 2fd847e..ba907de 100644 --- a/gff.hpp +++ b/gff.hpp @@ -61,6 +61,8 @@ struct Locus size_t contigLen {0}; // 0 <=> unknown bool crossOrigin {false}; + string gene; + string product; Locus (size_t lineNum_arg, @@ -69,14 +71,25 @@ struct Locus size_t stop_arg, bool strand_arg, bool partial_arg, - size_t crossOriginSeqLen); + size_t crossOriginSeqLen, + string gene_arg, + string product_arg); Locus () = default; bool empty () const { return contig. empty (); } void print (ostream &os) const - { os << contig << ' ' << start << ' ' << stop << ' ' << strand << ' ' << contigLen << ' ' << crossOrigin << endl; } + { os << contig + << ' ' << start + << ' ' << stop + << ' ' << strand + << ' ' << contigLen + << ' ' << crossOrigin + << ' ' << gene + << ' ' << product + << endl; + } bool operator< (const Locus& other) const; size_t size () const { return crossOrigin diff --git a/version.txt b/version.txt index 7b59a5c..8d7f852 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.10.2 +3.10.4