Skip to content

Commit

Permalink
Improved the detection of IDs interpreted by BLAST as PDB chain IDs i…
Browse files Browse the repository at this point in the history
…n the CreateSchema, AlleleCall and PrepExternalSchema modules.
  • Loading branch information
rfm-targa committed Jul 15, 2024
1 parent d4b8676 commit 2cf781f
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 20 deletions.
25 changes: 20 additions & 5 deletions CHEWBBACA/chewBBACA.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,15 +212,15 @@ def msg(name=None):
genome_list = fo.join_paths(args.output_directory, [ct.GENOME_LIST])
args.input_files, total_inputs = pv.check_input_type(args.input_files, genome_list)
# Detect if some inputs share the same unique prefix
repeated_prefixes = pv.check_unique_prefixes(args.input_files)
repeated_prefixes = pv.check_unique_prefixes(genome_list)
# Detect if filenames include blank spaces
blank_spaces = pv.check_blanks(args.input_files)
blank_spaces = pv.check_blanks(genome_list)
# Check if any input file has an unique prefix >= 50 characters
long_prefixes = pv.check_prefix_length(args.input_files)
# Check if any file prefixes are interpreted as PDB IDs
long_prefixes = pv.check_prefix_length(genome_list)
# Check if any input file prefixes are interpreted as PDB IDs
makeblastdb_path = fo.join_paths(args.blast_path, [ct.MAKEBLASTDB_ALIAS])
blastdbcmd_path = fo.join_paths(args.blast_path, [ct.BLASTDBCMD_ALIAS])
pdb_prefixes = pv.check_prefix_pdb(args.input_files, args.output_directory, makeblastdb_path, blastdbcmd_path)
pdb_prefixes = pv.check_prefix_pdb(genome_list, args.output_directory, makeblastdb_path, blastdbcmd_path)

print(f'Number of inputs: {total_inputs}')

Expand Down Expand Up @@ -503,6 +503,10 @@ def msg(name=None):
blank_spaces = pv.check_blanks(genome_list)
# Check if any input file has an unique prefix >= 50 characters
long_prefixes = pv.check_prefix_length(genome_list)
# Check if any input file prefixes are interpreted as PDB IDs
makeblastdb_path = fo.join_paths(args.blast_path, [ct.MAKEBLASTDB_ALIAS])
blastdbcmd_path = fo.join_paths(args.blast_path, [ct.BLASTDBCMD_ALIAS])
pdb_prefixes = pv.check_prefix_pdb(genome_list, args.output_directory, makeblastdb_path, blastdbcmd_path)

# Determine if schema was downloaded from Chewie-NS
ns_config = fo.join_paths(args.schema_directory, ['.ns_config'])
Expand Down Expand Up @@ -1024,6 +1028,17 @@ def msg(name=None):
else:
loci_list, total_loci = pv.check_input_type(args.schema_directory, loci_list)

# Detect if some inputs share the same unique prefix
repeated_prefixes = pv.check_unique_prefixes(loci_list)
# Detect if filenames include blank spaces
blank_spaces = pv.check_blanks(loci_list)
# Check if any input file has an unique prefix >= 50 characters
long_prefixes = pv.check_prefix_length(loci_list)
# Check if any input file prefixes are interpreted as PDB IDs
makeblastdb_path = fo.join_paths(args.blast_path, [ct.MAKEBLASTDB_ALIAS])
blastdbcmd_path = fo.join_paths(args.blast_path, [ct.BLASTDBCMD_ALIAS])
pdb_prefixes = pv.check_prefix_pdb(loci_list, args.output_directory, makeblastdb_path, blastdbcmd_path)

print(f'Number of cores: {args.cpu_cores}')
print(f'BLAST Score Ratio: {args.blast_score_ratio}')
print(f'Translation table: {args.translation_table}')
Expand Down
13 changes: 7 additions & 6 deletions CHEWBBACA/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@
DUMMY_FASTA = 'dummy.fasta'
DUMMY_BLASTDB = 'dummy_db'
DUMMY_DIR = 'dummy_dir'
DUMMY_OUTPUT = 'dummy_output.fasta'
DUMMY_BLASTDBCMD_FASTA = 'dummy_blastdbcmd.fasta'

# BLAST warnings to be ignored
# This warning is raised in BLAST>=2.10 when passing a TXT file with sequence identifiers to -seqidlist
Expand Down Expand Up @@ -637,10 +637,11 @@
'(e.g. BLAST does not accept sequence IDs longer than 50 '
'characters when creating a database).')

INPUTS_PDB_PREFIX = ('The following input files have a prefix that is '
'interpreted as a PDB ID:\n{0}\nPlease ensure that '
'the prefix is not a valid PDB ID. This is necessary '
'to avoid issues related to how these IDs are recognized '
'by chewBBACA and its dependencies.')
INPUTS_PDB_PREFIX = ('The following input files have prefixes that are '
'interpreted by BLAST as chain PDB IDs:\n{0}\nBLAST modifies the '
'IDs of the CDSs that include these prefixes when creating a database, '
'which leads to issues when chewBBACA cannot find the original '
'IDs in the results. Please ensure that the file prefixes (substring '
'before the first "." in the filename) cannot be interpreted as chain PDB IDs.')

MISSING_INPUT_ARG = ('Path to input files does not exist. Please provide a valid path.')
35 changes: 26 additions & 9 deletions CHEWBBACA/utils/parameters_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1351,29 +1351,46 @@ def check_prefix_length(input_list):


def check_prefix_pdb(input_list, output_directory, makeblastdb_path, blastdbcmd_path):
"""
"""Check if the BLAST database includes the expected sequence IDs.
Parameters
----------
input_list : str
Path to file that contains the list of paths to input files.
output_directory : str
Path to the directory where dummy data will be created.
makeblastdb_path : str
Path to the makeblastdb executable.
blastdbcmd_path : str
Path to the blastdbcmd executable.
Returns
-------
None
"""
input_paths = fo.read_lines(input_list)
prefixes = get_file_prefixes(input_paths)
print(prefixes)
# Create directory to store dummy data
dummy_dir = os.path.join(output_directory, ct.DUMMY_DIR)
fo.create_directory(dummy_dir)
# Create dummy FASTA records
dummy_seqids = [f'{i}-protein1' for i in (prefixes)]
dummy_records = [fao.fasta_str_record(ct.FASTA_RECORD_TEMPLATE, [i, ct.DUMMY_PROT]) for i in dummy_seqids]
dummy_file = os.path.join(dummy_dir, ct.DUMMY_FASTA)
fo.write_lines(dummy_records, dummy_file)
dummy_fasta = os.path.join(dummy_dir, ct.DUMMY_FASTA)
fo.write_lines(dummy_records, dummy_fasta)
# Create BLAST db
blastdb = os.path.join(dummy_dir, ct.DUMMY_BLASTDB)
bw.make_blast_db(makeblastdb_path, dummy_file, blastdb, 'prot')
dummy_blastdb = os.path.join(dummy_dir, ct.DUMMY_BLASTDB)
bw.make_blast_db(makeblastdb_path, dummy_fasta, dummy_blastdb, 'prot')
# Get sequence from BLAST db
output_fasta = os.path.join(dummy_dir, ct.DUMMY_OUTPUT)
blastdbcmd_std = bw.run_blastdbcmd(blastdbcmd_path, blastdb, output_fasta)
dummy_blastdbcmd_fasta = os.path.join(dummy_dir, ct.DUMMY_BLASTDBCMD_FASTA)
blastdbcmd_std = bw.run_blastdbcmd(blastdbcmd_path, dummy_blastdb, dummy_blastdbcmd_fasta)
# Check if the sequence IDs in the BLAST db match the expected IDs
records = fao.sequence_generator(output_fasta)
records = fao.sequence_generator(dummy_blastdbcmd_fasta)
recids = [record.id for record in records]
modified_seqids = set(dummy_seqids) - set(recids)
# Delete dummy data
fo.delete_directory(dummy_dir)
# Exit if BLAST db includes sequence IDs that do not match the expected IDs
if len(modified_seqids) > 0:
pdb_prefix = [prefixes[p.split('-protein')[0]][0] for p in modified_seqids]
sys.exit(ct.INPUTS_PDB_PREFIX.format('\n'.join(pdb_prefix)))

0 comments on commit 2cf781f

Please sign in to comment.