Improved the detection of IDs interpreted by BLAST as PDB chain IDs i…

…n the CreateSchema, AlleleCall and PrepExternalSchema modules.
B-UMMI · Jul 15, 2024 · 2cf781f · 2cf781f
1 parent d4b8676
commit 2cf781f
Show file tree

Hide file tree

Showing 3 changed files with 53 additions and 20 deletions.
diff --git a/CHEWBBACA/chewBBACA.py b/CHEWBBACA/chewBBACA.py
@@ -212,15 +212,15 @@ def msg(name=None):
 	genome_list = fo.join_paths(args.output_directory, [ct.GENOME_LIST])
 	args.input_files, total_inputs = pv.check_input_type(args.input_files, genome_list)
 	# Detect if some inputs share the same unique prefix
-	repeated_prefixes = pv.check_unique_prefixes(args.input_files)
+	repeated_prefixes = pv.check_unique_prefixes(genome_list)
 	# Detect if filenames include blank spaces
-	blank_spaces = pv.check_blanks(args.input_files)
+	blank_spaces = pv.check_blanks(genome_list)
 	# Check if any input file has an unique prefix >= 50 characters
-	long_prefixes = pv.check_prefix_length(args.input_files)
-	# Check if any file prefixes are interpreted as PDB IDs
+	long_prefixes = pv.check_prefix_length(genome_list)
+	# Check if any input file prefixes are interpreted as PDB IDs
 	makeblastdb_path = fo.join_paths(args.blast_path, [ct.MAKEBLASTDB_ALIAS])
 	blastdbcmd_path = fo.join_paths(args.blast_path, [ct.BLASTDBCMD_ALIAS])
-	pdb_prefixes = pv.check_prefix_pdb(args.input_files, args.output_directory, makeblastdb_path, blastdbcmd_path)
+	pdb_prefixes = pv.check_prefix_pdb(genome_list, args.output_directory, makeblastdb_path, blastdbcmd_path)
 
 	print(f'Number of inputs: {total_inputs}')
 
@@ -503,6 +503,10 @@ def msg(name=None):
 	blank_spaces = pv.check_blanks(genome_list)
 	# Check if any input file has an unique prefix >= 50 characters
 	long_prefixes = pv.check_prefix_length(genome_list)
+	# Check if any input file prefixes are interpreted as PDB IDs
+	makeblastdb_path = fo.join_paths(args.blast_path, [ct.MAKEBLASTDB_ALIAS])
+	blastdbcmd_path = fo.join_paths(args.blast_path, [ct.BLASTDBCMD_ALIAS])
+	pdb_prefixes = pv.check_prefix_pdb(genome_list, args.output_directory, makeblastdb_path, blastdbcmd_path)
 
 	# Determine if schema was downloaded from Chewie-NS
 	ns_config = fo.join_paths(args.schema_directory, ['.ns_config'])
@@ -1024,6 +1028,17 @@ def msg(name=None):
 	else:
 		loci_list, total_loci = pv.check_input_type(args.schema_directory, loci_list)
 
+	# Detect if some inputs share the same unique prefix
+	repeated_prefixes = pv.check_unique_prefixes(loci_list)
+	# Detect if filenames include blank spaces
+	blank_spaces = pv.check_blanks(loci_list)
+	# Check if any input file has an unique prefix >= 50 characters
+	long_prefixes = pv.check_prefix_length(loci_list)
+	# Check if any input file prefixes are interpreted as PDB IDs
+	makeblastdb_path = fo.join_paths(args.blast_path, [ct.MAKEBLASTDB_ALIAS])
+	blastdbcmd_path = fo.join_paths(args.blast_path, [ct.BLASTDBCMD_ALIAS])
+	pdb_prefixes = pv.check_prefix_pdb(loci_list, args.output_directory, makeblastdb_path, blastdbcmd_path)
+
 	print(f'Number of cores: {args.cpu_cores}')
 	print(f'BLAST Score Ratio: {args.blast_score_ratio}')
 	print(f'Translation table: {args.translation_table}')

diff --git a/CHEWBBACA/utils/constants.py b/CHEWBBACA/utils/constants.py
@@ -153,7 +153,7 @@
 DUMMY_FASTA = 'dummy.fasta'
 DUMMY_BLASTDB = 'dummy_db'
 DUMMY_DIR = 'dummy_dir'
-DUMMY_OUTPUT = 'dummy_output.fasta'
+DUMMY_BLASTDBCMD_FASTA = 'dummy_blastdbcmd.fasta'
 
 # BLAST warnings to be ignored
 # This warning is raised in BLAST>=2.10 when passing a TXT file with sequence identifiers to -seqidlist
@@ -637,10 +637,11 @@
 					  '(e.g. BLAST does not accept sequence IDs longer than 50 '
 					  'characters when creating a database).')
 
-INPUTS_PDB_PREFIX = ('The following input files have a prefix that is '
-					 'interpreted as a PDB ID:\n{0}\nPlease ensure that '
-					 'the prefix is not a valid PDB ID. This is necessary '
-					 'to avoid issues related to how these IDs are recognized '
-					 'by chewBBACA and its dependencies.')
+INPUTS_PDB_PREFIX = ('The following input files have prefixes that are '
+					 'interpreted by BLAST as chain PDB IDs:\n{0}\nBLAST modifies the '
+					 'IDs of the CDSs that include these prefixes when creating a database, '
+					 'which leads to issues when chewBBACA cannot find the original '
+					 'IDs in the results. Please ensure that the file prefixes (substring '
+					 'before the first "." in the filename) cannot be interpreted as chain PDB IDs.')
 
 MISSING_INPUT_ARG = ('Path to input files does not exist. Please provide a valid path.')
diff --git a/CHEWBBACA/utils/parameters_validation.py b/CHEWBBACA/utils/parameters_validation.py
@@ -1351,29 +1351,46 @@ def check_prefix_length(input_list):
 
 
 def check_prefix_pdb(input_list, output_directory, makeblastdb_path, blastdbcmd_path):
-	"""
+	"""Check if the BLAST database includes the expected sequence IDs.
+
+	Parameters
+	----------
+	input_list : str
+		Path to file that contains the list of paths to input files.
+	output_directory : str
+		Path to the directory where dummy data will be created.
+	makeblastdb_path : str
+		Path to the makeblastdb executable.
+	blastdbcmd_path : str
+		Path to the blastdbcmd executable.
+
+	Returns
+	-------
+	None
 	"""
 	input_paths = fo.read_lines(input_list)
 	prefixes = get_file_prefixes(input_paths)
-	print(prefixes)
 	# Create directory to store dummy data
 	dummy_dir = os.path.join(output_directory, ct.DUMMY_DIR)
 	fo.create_directory(dummy_dir)
 	# Create dummy FASTA records
 	dummy_seqids = [f'{i}-protein1' for i in (prefixes)]
 	dummy_records = [fao.fasta_str_record(ct.FASTA_RECORD_TEMPLATE, [i, ct.DUMMY_PROT]) for i in dummy_seqids]
-	dummy_file = os.path.join(dummy_dir, ct.DUMMY_FASTA)
-	fo.write_lines(dummy_records, dummy_file)
+	dummy_fasta = os.path.join(dummy_dir, ct.DUMMY_FASTA)
+	fo.write_lines(dummy_records, dummy_fasta)
 	# Create BLAST db
-	blastdb = os.path.join(dummy_dir, ct.DUMMY_BLASTDB)
-	bw.make_blast_db(makeblastdb_path, dummy_file, blastdb, 'prot')
+	dummy_blastdb = os.path.join(dummy_dir, ct.DUMMY_BLASTDB)
+	bw.make_blast_db(makeblastdb_path, dummy_fasta, dummy_blastdb, 'prot')
 	# Get sequence from BLAST db
-	output_fasta = os.path.join(dummy_dir, ct.DUMMY_OUTPUT)
-	blastdbcmd_std = bw.run_blastdbcmd(blastdbcmd_path, blastdb, output_fasta)
+	dummy_blastdbcmd_fasta = os.path.join(dummy_dir, ct.DUMMY_BLASTDBCMD_FASTA)
+	blastdbcmd_std = bw.run_blastdbcmd(blastdbcmd_path, dummy_blastdb, dummy_blastdbcmd_fasta)
 	# Check if the sequence IDs in the BLAST db match the expected IDs
-	records = fao.sequence_generator(output_fasta)
+	records = fao.sequence_generator(dummy_blastdbcmd_fasta)
 	recids = [record.id for record in records]
 	modified_seqids = set(dummy_seqids) - set(recids)
+	# Delete dummy data
+	fo.delete_directory(dummy_dir)
+	# Exit if BLAST db includes sequence IDs that do not match the expected IDs
 	if len(modified_seqids) > 0:
 		pdb_prefix = [prefixes[p.split('-protein')[0]][0] for p in modified_seqids]
 		sys.exit(ct.INPUTS_PDB_PREFIX.format('\n'.join(pdb_prefix)))