Skip to content

Commit

Permalink
fix(--gtdbtk_classification_file): Add method to parse taxonomy from …
Browse files Browse the repository at this point in the history
…classify file.

Closes #428
  • Loading branch information
aaronmussig committed Sep 30, 2022
1 parent 7d45a9b commit 0f124fa
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 8 deletions.
11 changes: 9 additions & 2 deletions gtdbtk/files/classify_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,15 @@ def has_row(self) -> bool:
return True
return False

def get_gid_taxonomy(self) -> Dict[str, List[str]]:
    """Return the classification of every row, split into its ranks.

    Each entry of ``self.rows`` is keyed by genome id; the row's
    ``classification`` string is split on ``;`` and the resulting list is
    stored under the same genome id.

    Returns:
        A dictionary mapping genome id to the list of seven rank strings.

    Raises:
        GTDBTkExit: If a classification does not split into exactly
            seven fields.
    """
    taxonomy = {}
    for gid in self.rows:
        row = self.rows[gid]
        ranks = row.classification.split(';')
        # Accept only complete seven-field strings; anything else aborts.
        if len(ranks) == 7:
            taxonomy[gid] = ranks
            continue
        raise GTDBTkExit(f'Expected a 7-rank taxonomy for {gid} but got {row.classification}')
    return taxonomy

def write(self):
"""Writes the summary file to disk. None will be replaced with N/A"""
with open(self.path, 'w') as fh:
Expand Down Expand Up @@ -174,5 +183,3 @@ class ClassifySummaryFileBAC120(ClassifySummaryFile):
def __init__(self, out_dir: str, prefix: str):
    """Initialise the bac120 summary file located under ``out_dir``.

    The file name is produced by formatting ``PATH_BAC120_SUMMARY_OUT``
    with ``prefix``; it is then joined onto ``out_dir`` and handed to the
    parent constructor together with the ``'bac120'`` label.
    """
    summary_name = PATH_BAC120_SUMMARY_OUT.format(prefix=prefix)
    super().__init__(os.path.join(out_dir, summary_name), 'bac120')


20 changes: 14 additions & 6 deletions gtdbtk/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
from gtdbtk.external.fasttree import FastTree
from gtdbtk.infer_ranks import InferRanks
from gtdbtk.files.batchfile import Batchfile
from gtdbtk.files.classify_summary import ClassifySummaryFileAR53
from gtdbtk.files.classify_summary import ClassifySummaryFileAR53, ClassifySummaryFile
from gtdbtk.markers import Markers
from gtdbtk.misc import Misc
from gtdbtk.model.enum import Domain
Expand Down Expand Up @@ -204,16 +204,24 @@ def _read_taxonomy_files(self, options) -> Dict[str, Tuple[str, str, str, str, s
check_file_exists(options.gtdbtk_classification_file)

self.logger.info('Reading GTDB-Tk classification file.')
gtdbtk_taxonomy = Taxonomy().read(options.gtdbtk_classification_file)
del gtdbtk_taxonomy['user_genome']
num_reassigned = 0
gtdbtk_classify_file = ClassifySummaryFile(path=options.gtdbtk_classification_file)
gtdbtk_classify_file.read()
gtdbtk_taxonomy = gtdbtk_classify_file.get_gid_taxonomy()
if len(gtdbtk_taxonomy) == 0:
raise GTDBTkExit(f'No genomes found in GTDB-Tk classification file: {options.gtdbtk_classification_file}')

num_rep_reassigned = 0
num_usr_reassigned = 0
for gid, taxa in gtdbtk_taxonomy.items():
if gid in taxonomy:
num_reassigned += 1
num_rep_reassigned += 1
else:
num_usr_reassigned += 1
taxonomy[gid] = taxa

self.logger.info(f'Read GTDB-Tk classifications for {len(gtdbtk_taxonomy):,} genomes.')
self.logger.info(f'Reassigned taxonomy for {num_reassigned:,} GTDB representative genomes.')
self.logger.info(f'Reassigned taxonomy for {num_rep_reassigned:,} GTDB representative '
f'genomes, and {num_usr_reassigned:,} query genomes.')

if options.custom_taxonomy_file:
# add and overwrite taxonomy for genomes specified in the
Expand Down

0 comments on commit 0f124fa

Please sign in to comment.