diff --git a/OrthoEvol/Orthologs/Blast/base_blastn.py b/OrthoEvol/Orthologs/Blast/base_blastn.py index 88fd7178..ec48b694 100644 --- a/OrthoEvol/Orthologs/Blast/base_blastn.py +++ b/OrthoEvol/Orthologs/Blast/base_blastn.py @@ -68,10 +68,14 @@ def __init__(self, project, method, template=None, save_data=True, quiet=True, * self.complete_time_file = self.project + '_TIME.csv' self.complete_time_file_path = self.data / Path(self.complete_time_file) - self.blastn_parameters, self.query_config = self.select_method(method=self.method) + self.blastn_parameters, self.query_config = BaseBlastN.select_method(method=self.method) def _make_blast_dir(self, gene, path): - """Create a blast directory for a gene.""" + """Create a blast directory for a gene. + + :param gene: A gene that will be run with blastn. + :param path: The path of the gene directory named with the gene name. + """ try: Path.mkdir(path, exist_ok=True, parents=True) self.blastn_log.debug("Directory created for %s" % gene) @@ -79,7 +83,12 @@ def _make_blast_dir(self, gene, path): self.blastn_log.debug("Directory exists for %s" % gene) def _create_temp_fasta(self, query, gene, query_config): - """Create a temporary fasta file using blastdbcmd.""" + """Create a temporary fasta file using blastdbcmd. + + :param query: The blast query or reference accession. + :param gene: The gene name of the accession. + :param query_config: A configuration dict for the blastdbcmd string. + """ try: blastdbcmd_query = "blastdbcmd -entry {query} -db {db} -outfmt %f -out {temp fasta}".format(**query_config) blastdbcmd_status = run(blastdbcmd_query, stdout=PIPE, @@ -97,10 +106,11 @@ def _create_temp_fasta(self, query, gene, query_config): with contextlib.suppress(ValueError): self.current_gene_list.remove(gene) - def select_method(self, method): + @staticmethod + def select_method(method): """Select a method for running blastn. - :param method: a blast method - 1, 2, 3, or None + :param method: The blast method to use. 
Either 1, 2, or 3. """ # Local blast using seqidlist if method == 1: @@ -140,7 +150,7 @@ def select_method(self, method): 'temp fasta': ''} else: raise ValueError('%s is not a blast method.' % method) - return (blastn_parameters, query_config) + return blastn_parameters, query_config def configure(self, query_accessions, query_organism, auto_start=False): """This method configures everything for our BLAST workflow. @@ -227,7 +237,8 @@ def parse_xml(self, xml_path, gene, organism): for hsp in hit.hsps: # Find the highest scoring hit for each gene if hsp.bitscore_raw > maximum: - # If the gene is a predicted non-coding RefSeq gene then go the the next hit + # If the gene is a predicted non-coding RefSeq gene + # then go to the next hit # https://en.wikipedia.org/wiki/RefSeq if "xr" in str(hit.id.lower()): self.blastn_log.info("Encountered a predicted(X*_00000) " diff --git a/OrthoEvol/Orthologs/Blast/comparative_genetics.py b/OrthoEvol/Orthologs/Blast/comparative_genetics.py index ef92c152..731911d8 100644 --- a/OrthoEvol/Orthologs/Blast/comparative_genetics.py +++ b/OrthoEvol/Orthologs/Blast/comparative_genetics.py @@ -55,11 +55,13 @@ def __init__(self, project=None, project_path=os.getcwd(), acc_file=None, downstream processing or for basic observation of the data. :param project: The name of the project. - :param project_path: The location of the project, which is generally defined by the ProjectManagement configuration. + :param project_path: The location of the project, which is generally + defined by the ProjectManagement configuration. :param acc_file: The name of the accession file. :param taxon_file: A file that contains an ordered list of taxonomy ids. - :param pre_blast: A flag that gives the user access to an API that contains extra information about their genes - using the mygene package. + :param pre_blast: A flag that gives the user access to an API that + contains extra information about their genes using the + mygene package.
:param post_blast: A flag that is used to handle a BLAST result file, which returns information about misssing data, duplicates, etc. :param hgnc: A flag used as a placeholder for future work with HGNC files. @@ -121,7 +123,8 @@ def __init__(self, project=None, project_path=os.getcwd(), acc_file=None, self.taxon_path = self.project_index / Path(self.__taxon_filename) # Handle the master accession file (could be before or after blast) if self.copy_from_package: - shutil.copy(pkg_resources.resource_filename(data.__name__, kwargs['MAF']), str(self.project_index)) + shutil.copy(pkg_resources.resource_filename(data.__name__, kwargs['MAF']), + str(self.project_index)) self.acc_file = self.MAF = kwargs['MAF'] self.acc_filename = self.acc_file if self.acc_file is not None: @@ -169,12 +172,14 @@ def __init__(self, project=None, project_path=os.getcwd(), acc_file=None, self.building = pd.read_csv(str(self.acc_path), dtype=str) del self.building['Tier'] del self.building[self.species] - self.building = self.building.set_index('Gene') # Object for good user output + # Object for good user output + self.building = self.building.set_index('Gene') self.building_file_path = self.data / Path(self.building_filename) # Blast time points + # Master time file for the blast self.building_time_filename = self.building_filename.replace( - 'building.csv', 'building_time.csv') # Master time file for the blast + 'building.csv', 'building_time.csv') self.building_time = pd.read_csv(str(self.acc_path), dtype=str) del self.building_time['Tier'] del self.building_time[self.species] @@ -218,8 +223,8 @@ def __init__(self, project=None, project_path=os.getcwd(), acc_file=None, self.gene_dict = self.df.T.to_dict() self.get_master_lists(self.__data) # populates our lists else: - self.building_filename = str(self.project + 'building.csv') - self.building_time_filename = str(self.project + 'building_time.csv') + self.building_filename = str(self.project + '_building.csv') + self.building_time_filename 
= str(self.project + '_building_time.csv') # //TODO-ROB Add HGNC python module @@ -229,9 +234,8 @@ def get_file_list(file): :param file: Name of csv file. """ - - data = pd.read_csv(file, header=None) - file_list = list(data[0]) + file_data = pd.read_csv(file, header=None) + file_list = list(file_data[0]) return file_list def get_master_lists(self, df, csv_file=None): diff --git a/OrthoEvol/Orthologs/Blast/orthologs_blastn.py b/OrthoEvol/Orthologs/Blast/orthologs_blastn.py index 758a0236..9f4bc461 100644 --- a/OrthoEvol/Orthologs/Blast/orthologs_blastn.py +++ b/OrthoEvol/Orthologs/Blast/orthologs_blastn.py @@ -8,7 +8,7 @@ class OrthoBlastN(BaseBlastN): def __init__(self, project="orthology-gpcr", method=3, template=None, save_data=True, acc_file="gpcr.csv", copy_from_package=True, **kwargs): - """This class inherits from the CompGenFiles class. + """This class inherits from the BaseBlastN class. This class utilizes it's parent classes to search a standalone Blast database for specific orthologs of a gene using a query organism @@ -21,6 +21,7 @@ def __init__(self, project="orthology-gpcr", method=3, template=None, :param template: The accession file template. :param save_data: A flag for saving the post_blast data to an excel file. :param acc_file: The accession file to use. (Default: 'karg.csv') + :param copy_from_package: Copy the acc_file from the package. (Default: True) :param kwargs: """ # Set values for methods to prevent using a config. @@ -43,4 +44,5 @@ def __init__(self, project="orthology-gpcr", method=3, template=None, proj_mana=self.proj_mana, **kwargs) def run(self): + """Run the blast using a default configuration.""" self.configure(self.blast_human, self.species, auto_start=True)