Skip to content

Commit

Permalink
PD-3347 -lcl parameter in gff_check and amr_report
Browse files Browse the repository at this point in the history
  • Loading branch information
vbrover committed Mar 24, 2020
1 parent 2c69418 commit 45f1afd
Showing 8 changed files with 106 additions and 57 deletions.
71 changes: 40 additions & 31 deletions amr_report.cpp
Original file line number Diff line number Diff line change
@@ -1087,14 +1087,21 @@ struct Batch
{
if (isLeft (f. line, "#"))
continue;
string organism_, accession, geneMutation, classS, subclass, name;
int pos;
iss. reset (f. line);
iss >> organism_ >> accession >> pos >> geneMutation >> classS >> subclass >> name;
QC_ASSERT (pos > 0);
replace (organism_, '_', ' ');
if (organism_ == organism)
accession2mutations [accession] << move (Mutation ((size_t) pos, geneMutation, classS, subclass, name));
try
{
string organism_, accession, geneMutation, classS, subclass, name;
int pos;
iss. reset (f. line);
iss >> organism_ >> accession >> pos >> geneMutation >> classS >> subclass >> name;
QC_ASSERT (pos > 0);
replace (organism_, '_', ' ');
if (organism_ == organism)
accession2mutations [accession] << move (Mutation ((size_t) pos, geneMutation, classS, subclass, name));
}
catch (const exception &e)
{
throw runtime_error ("Reading " + strQuote (mutation_tab) + " line:\n" + f. line + "\n" + e. what ());
}
}
if (verbose ())
PRINT (accession2mutations. size ());
@@ -1348,34 +1355,34 @@ struct Batch
{
// Cf. BlastAlignment::saveText()
TabDel td;
td << "Protein identifier"; // targetName // PD-2534 // 1
td << "Protein identifier"; // 1 // targetName // PD-2534
if (cdsExist)
// Contig
td << "Contig id" // 2
<< "Start" // targetStart // 3
<< "Stop" // targetEnd // 4
<< "Strand"; // targetStrand // 5
td << (print_fam ? "FAM.id" : "Gene symbol") // 6
<< "Sequence name"
<< "Scope" // PD-2825
td << "Contig id" // 2
<< "Start" // targetStart // 3
<< "Stop" // targetEnd // 4
<< "Strand"; // targetStrand // 5
td << (print_fam ? "FAM.id" : "Gene symbol") // 6 or 2
<< "Sequence name" // 7 or 3
<< "Scope" // PD-2825 // 8 or 4
// PD-1856
<< "Element type"
<< "Element subtype"
<< "Class"
<< "Subclass"
<< "Element type" // 9 or 5
<< "Element subtype" // 10 or 6
<< "Class" // 11 or 7
<< "Subclass" // 12 or 8
//
<< "Method"
<< "Target length"
<< "Method" // 13 or 9
<< "Target length" // 14 or 10
//
<< "Reference sequence length" // refLen
<< "% Coverage of reference sequence" // queryCoverage
<< "% Identity to reference sequence"
<< "Alignment length" // targetSeq. size ()
<< "Accession of closest sequence" // refAccession
<< "Name of closest sequence"
<< "Reference sequence length" // 15 or 11 // refLen
<< "% Coverage of reference sequence" // 16 or 12 // queryCoverage
<< "% Identity to reference sequence" // 17 or 13
<< "Alignment length" // 18 or 14 // targetSeq. size ()
<< "Accession of closest sequence" // 19 or 15 // refAccession
<< "Name of closest sequence" // 20 or 16
//
<< "HMM id"
<< "HMM description"
<< "HMM id" // 21 or 17
<< "HMM description" // 22 or 18
;
if (cdsExist)
if (useCrossOrigin)
@@ -1507,6 +1514,7 @@ struct ThisApplication : Application
addKey ("gff_match", ".gff-FASTA matching file for \"locus_tag\": \"<FASTA id> <locus_tag>\"");
addFlag ("bed", "Browser Extensible Data format of the <gff> file");
addFlag ("pgap", "Protein, genomic and GFF files are created by the NCBI PGAP");
addFlag ("lcl", "Nucleotide FASTA created by PGAP has \"lcl|\" prefix in accessions");
addKey ("dna_len", "File with lines: <dna id> <dna length>");
addKey ("hmmdom", "HMM domain alignments");
addKey ("hmmsearch", "Output of hmmsearch");
@@ -1545,6 +1553,7 @@ struct ThisApplication : Application
const string gffMatchFName = getArg ("gff_match");
const bool bedP = getFlag ("bed");
const bool pgap = getFlag ("pgap");
const bool lcl = getFlag ("lcl");
const string dnaLenFName = getArg ("dna_len");
const string hmmDom = getArg ("hmmdom");
const string hmmsearch = getArg ("hmmsearch");
@@ -1740,7 +1749,7 @@ struct ThisApplication : Application
else
{
Annot::Gff gff;
annot. reset (new Annot (gff, gffFName, false, ! gffMatchFName. empty (), pgap));
annot. reset (new Annot (gff, gffFName, false, ! gffMatchFName. empty (), pgap, lcl));
}
ASSERT (annot. get ());
map<string/*seqid*/,string/*locusTag*/> seqId2locusTag;
19 changes: 17 additions & 2 deletions amrfinder.cpp
Original file line number Diff line number Diff line change
@@ -20,7 +20,7 @@
* warranties of performance, merchantability or fitness for any particular
* purpose.
*
* Please cite the author in any work or product based on this material.
* Please cite NCBI in any work or product based on this material.
*
* ===========================================================================
*
@@ -33,6 +33,8 @@
* cat, cp, cut, grep, head, mkdir, mv, nproc, sort, tail, which
*
* Release changes:
* 03/24/2020 PD-3347 -lcl parameter if gff_check and amr_report
* 3.6.18 03/17/2020 PD-3396 amr_report.cpp prints a better error message on missing sublcass in data
* 3.6.17 03/12/2020 Software version is printed after software directory
* 3.6.16 03/06/2020 PD-3363 --mutation_all: UNKNOWN are not reported
* PD-2328 Last 2 columns of report are real HMM hits
@@ -480,9 +482,22 @@ struct ThisApplication : ShellApplication
prog2dir ["amr_report"] = execDir;


bool lcl = false;
if (pgap && ! emptyArg (dna)) // PD-3347
{
LineInput f (unQuote (dna));
while (f. nextLine ())
if (isLeft (f. line, ">lcl|"))
{
lcl = true;
break;
}
}


string amr_report_blastp;
string amr_report_blastx;
const string pgapS (ifS (pgap, " -pgap"));
const string pgapS (ifS (pgap, " -pgap" + ifS (lcl, " -lcl")));
bool blastxChunks = false;
{
Threads th (threads_max - 1, true);
2 changes: 1 addition & 1 deletion amrfinder_update.cpp
Original file line number Diff line number Diff line change
@@ -33,7 +33,7 @@
* mkdir, ln
* curl.{h,c}
*
* Release changes:
* Release changes: see amrfinder.cpp
*
*/

3 changes: 2 additions & 1 deletion common.cpp
Original file line number Diff line number Diff line change
@@ -883,7 +883,8 @@ bool getChar (istream &is,
ASSERT (i == c);
return false;
}
ASSERT (i >= 0 && i <= 255);
if (! (i >= 0 && i <= 255))
throw runtime_error ("Cannot read character: " + to_string (i));
c = static_cast<char> (i);

return true;
48 changes: 34 additions & 14 deletions gff.cpp
Original file line number Diff line number Diff line change
@@ -96,14 +96,45 @@ bool Locus::operator< (const Locus& other) const

// Gff

namespace
{

void pgap_accession (string &accession,
bool lcl)
// Update: accession
{
static const string gnlPrefix ("gnl|");
static const string lclPrefix ("lcl|");

size_t pos = accession. rfind (':');
if (pos == string::npos)
{
if (lcl)
accession = lclPrefix + accession;
}
else
{
if (lcl)
throw runtime_error ("Accession " + strQuote (accession) + " cannot have " + strQuote (gnlPrefix) + " and " + strQuote (lclPrefix) + " at the same time");
accession [pos] = '|';
accession = gnlPrefix + accession;
}
}

}



Annot::Annot (Gff,
const string &fName,
bool trimProject,
bool locus_tag,
bool pgap)
bool pgap,
bool lcl)
{
IMPLY (pgap, ! locus_tag);
IMPLY (pgap, ! trimProject);
IMPLY (lcl, pgap);

if (fName. empty ())
throw runtime_error ("Empty GFF file name");
@@ -214,19 +245,8 @@ Annot::Annot (Gff,
trim (locusTag, tmpSpace);
if (pgap)
{
const string gnlPrefix ("gnl|");
size_t pos = locusTag. rfind (':');
if (pos != string::npos)
{
locusTag [pos] = '|';
locusTag = gnlPrefix + locusTag;
}
pos = contig. rfind (':');
if (pos != string::npos)
{
contig [pos] = '|';
contig = gnlPrefix + contig;
}
pgap_accession (locusTag, false);
pgap_accession (contig, lcl);
}
QC_ASSERT (! locusTag. empty ());

3 changes: 2 additions & 1 deletion gff.hpp
Original file line number Diff line number Diff line change
@@ -101,7 +101,8 @@ struct Annot : Root
const string &fName,
bool trimProject,
bool locus_tag,
bool pgap);
bool pgap,
bool lcl);
// https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
// Requirement: the protein id should be in the attribute "Name=<id>" (9th field) of the rows with type "CDS" or "gene" (3rd field)
// Input: fName may be empty
15 changes: 9 additions & 6 deletions gff_check.cpp
Original file line number Diff line number Diff line change
@@ -60,7 +60,8 @@ struct ThisApplication : Application
addPositional ("gff", ".gff-file, if " + strQuote (noFile) + " then exit 0");
addKey ("prot", "Protein FASTA file");
addKey ("dna", "DNA FASTA file");
addFlag ("pgap", "Input files are created by PGAP"); // exclusive with --gpipe
addFlag ("pgap", "Protein, genomic and GFF files are created by the NCBI PGAP");
addFlag ("lcl", "Nucleotide FASTA created by PGAP has \"lcl|\" prefix in accessions");
// Output
addKey ("locus_tag", "Output file with matches: \"<FASTA id> <GFF id>\", where <id> is from " + strQuote (locus_tagS + "<id>") + " in the FASTA comment and from the .gff-file");
version = SVN_REV;
@@ -74,19 +75,19 @@ struct ThisApplication : Application
const string protFName = getArg ("prot");
const string dnaFName = getArg ("dna");
const string locus_tagFName = getArg ("locus_tag");
//const bool gpipe = getFlag ("gpipe");
const bool pgap = getFlag ("pgap");
const bool lcl = getFlag ("lcl");

//if (gpipe && pgapx)
//throw runtime_error ("Flags --gpipe and --pgapx are mutually exclusive");
if (lcl && ! pgap)
throw runtime_error ("-lcl requires -pgap");


if (isRight (gffName, noFile))
return;


Annot::Gff gff;
const Annot annot (gff, gffName, false, ! locus_tagFName. empty (), pgap);
const Annot annot (gff, gffName, false, ! locus_tagFName. empty (), pgap, lcl);


if (! protFName. empty ())
@@ -175,7 +176,9 @@ struct ThisApplication : Application
iss >> contigId;
ASSERT (! contains (contigId, ' '));
if (contigId. empty ())
throw runtime_error (__FILE__ ": No contig identifier in: " + f. line);
throw runtime_error (__FILE__ ": No contig identifier in:\n" + f. line);
if (lcl && ! isLeft (contigId, "lcl|"))
throw runtime_error (__FILE__ ": Contig identifier does not start with " + strQuote ("lcl|") + ":\n" + f. line);
contigIds << move (contigId);
}
contigIds. sort ();
2 changes: 1 addition & 1 deletion version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.6.17
3.6.18

0 comments on commit 45f1afd

Please sign in to comment.