Create blastx_from_gb.py

marcelauliano · Jun 28, 2023 · 7ccf1d5 · 7ccf1d5
1 parent 8ede6db
commit 7ccf1d5
Showing 1 changed file with 79 additions and 0 deletions.
diff --git a/blastx_from_gb.py b/blastx_from_gb.py
@@ -0,0 +1,79 @@
+import argparse
+import subprocess
+from Bio import SeqIO
+
+def extract_protein_sequences(genbank_files, protein_fasta_file):
+    if protein_fasta_file:
+        return load_protein_sequences_from_file(protein_fasta_file)
+    else:
+        return extract_protein_sequences_from_genbank(genbank_files)
+
+def extract_protein_sequences_from_genbank(genbank_files):
+    protein_sequences = []
+    for genbank_file in genbank_files:
+        for record in SeqIO.parse(genbank_file, "genbank"):
+            for feature in record.features:
+                if feature.type == "CDS" and "translation" in feature.qualifiers:
+                    protein_sequence = feature.qualifiers["translation"][0]
+                    gene_name = feature.qualifiers.get("gene", ["Unknown"])[0]
+                    organism = record.annotations.get("organism", "")
+                    if organism:
+                        fasta_id = f"{organism}|{gene_name}"
+                    else:
+                        fasta_id = gene_name
+                    protein_sequences.append((fasta_id, protein_sequence))
+    return protein_sequences
+
+def load_protein_sequences_from_file(protein_fasta_file):
+    protein_sequences = []
+    for record in SeqIO.parse(protein_fasta_file, "fasta"):
+        fasta_id = record.id
+        protein_sequence = str(record.seq)
+        protein_sequences.append((fasta_id, protein_sequence))
+    return protein_sequences
+
+def save_sequences_to_fasta(sequences, output_file):
+    with open(output_file, "w") as fasta_file:
+        for fasta_id, sequence in sequences:
+            fasta_file.write(f">{fasta_id}\n{sequence}\n")
+
+def create_protein_database(protein_fasta_file, database_path):
+    command = f"makeblastdb -in {protein_fasta_file} -dbtype prot -out {database_path}"
+    subprocess.run(command, shell=True, check=True)
+
+def run_blastx(nucleotide_file, protein_database_path, blast_output):
+    command = f"blastx -query {nucleotide_file} -db {protein_database_path} -outfmt '6 std qlen slen' -evalue 0.1 -out {blast_output}"
+    subprocess.run(command, shell=True, check=True)
+
+def main(genbank_files, nucleotide_file, blast_output, protein_fasta_file):
+    # Extract protein sequences
+    protein_sequences = extract_protein_sequences(genbank_files, protein_fasta_file)
+
+    # Save protein sequences to a fasta file if not provided
+    if not protein_fasta_file:
+        protein_fasta_file = "protein_sequences.fasta"
+        save_sequences_to_fasta(protein_sequences, protein_fasta_file)
+
+    # Create protein database    
+    protein_database_path = "protein_database"
+    create_protein_database(protein_fasta_file, protein_database_path)
+
+    # Run blastx
+    run_blastx(nucleotide_file, protein_database_path, blast_output)
+
+    # Add header line to blast output file
+    header_line = "qseqid\tsseqid\t%_identity\talig_length\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbit score\tqlen\tslen\n"
+    with open(blast_output, "r+") as output_file:
+        content = output_file.read()
+        output_file.seek(0, 0)
+        output_file.write(header_line + content)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Protein sequence extraction and blastx pipeline")
+    parser.add_argument("genbank_files", nargs="*", help="GenBank file(s) containing protein coding sequences")
+    parser.add_argument("nucleotide_file", help="Input nucleotide FASTA file")
+    parser.add_argument("-o", "--output", dest="blast_output", default="blast_results.txt", help="Blast output file")
+    parser.add_argument("-p", "--protein-fasta", dest="protein_fasta_file", help="Input protein FASTA file")
+    args = parser.parse_args()
+
+    main(args.genbank_files, args.nucleotide_file, args.blast_output, args.protein_fasta_file)