run black
d4straub committed Sep 1, 2022
1 parent 8d8b931 commit 2feaebc
Showing 7 changed files with 135 additions and 120 deletions.
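Every hunk below is mechanical reformatting by the black code formatter (double quotes, spaced operators, exploded multi-line literals); no script's behavior changes. As a rough sketch, a pass like this can be reproduced from the repository root with black's CLI. The line length is an assumption here (nf-core pipelines commonly configure 120, which would explain the long read_csv calls left on one line below); it is not stated in the commit itself:

    pip install black
    black --check --line-length 120 bin/    # report which scripts would be reformatted
    black --line-length 120 bin/            # rewrite them in place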
29 changes: 14 additions & 15 deletions bin/add_full_sequence_to_taxfile.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-#@author Jeanette Tangrot
+# @author Jeanette Tangrot
 # Takes one TSV taxonomy file from DADA2 and a sequence fasta file,
 # adds sequence to taxonomy based on ASV_ID

@@ -12,27 +12,26 @@
 
 # Read tsv and remove sequence column
 taxfile = sys.argv[1]
-tax = pd.read_csv(taxfile, sep='\t', header=0)
-tax.drop(columns='sequence', inplace=True)
+tax = pd.read_csv(taxfile, sep="\t", header=0)
+tax.drop(columns="sequence", inplace=True)
 
 # Read fasta file and store as data frame
-seqs = pd.DataFrame(columns=["id","sequence"])
+seqs = pd.DataFrame(columns=["id", "sequence"])
 seq = ""
 name = ""
-with open(sys.argv[2], 'r') as reader:
+with open(sys.argv[2], "r") as reader:
     for line in reader:
-        if line.startswith('>'):
-            if (seq != "" and name != ""):
-                seqs = seqs.append({'id':name, 'sequence': seq}, ignore_index=True)
+        if line.startswith(">"):
+            if seq != "" and name != "":
+                seqs = seqs.append({"id": name, "sequence": seq}, ignore_index=True)
             seq = ""
-            name = line.lstrip('>').rstrip('\s+*\n')
+            name = line.lstrip(">").rstrip("\s+*\n")
         else:
-            seq = seq + line.rstrip('\n')
-if (seq != "" and name != ""):
-    seqs = seqs.append({'id':name, 'sequence': seq}, ignore_index=True)
+            seq = seq + line.rstrip("\n")
+if seq != "" and name != "":
+    seqs = seqs.append({"id": name, "sequence": seq}, ignore_index=True)
 
 # Join taxonomy and full sequence, write to file
-tax = tax.set_index('ASV_ID').join(seqs.set_index('id'), how='outer')
+tax = tax.set_index("ASV_ID").join(seqs.set_index("id"), how="outer")
 outfile = sys.argv[3]
-tax.to_csv(outfile, sep='\t',na_rep="", index_label="ASV_ID")
-
+tax.to_csv(outfile, sep="\t", na_rep="", index_label="ASV_ID")
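
For orientation, the three sys.argv reads above imply an invocation like the following sketch (file names are illustrative, not from this commit):

    # args: DADA2 taxonomy TSV, ASV FASTA, output TSV
    add_full_sequence_to_taxfile.py ASV_tax.tsv ASV_seqs.fasta ASV_tax_with_seq.tsv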
64 changes: 32 additions & 32 deletions bin/add_sh_to_taxonomy.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-#@author Jeanette Tångrot
+# @author Jeanette Tångrot
 
 # Adds UNITE species hypothesis (SH) information to ASV table based on vsearch usearch_global results in blast6 format.
 #
@@ -22,29 +22,29 @@
 outfile = sys.argv[5]
 
 # Read sequence to SH matchings
-seq2sh = pd.read_csv(sys.argv[1], sep='\t', header=None, index_col=0, skiprows=None, compression='bz2')
+seq2sh = pd.read_csv(sys.argv[1], sep="\t", header=None, index_col=0, skiprows=None, compression="bz2")
 
 # Read SH taxonomies
 # Columns:
 # SH taxonid kingdom phylum class order family genus species
-shtax = pd.read_csv(sys.argv[2], sep='\t', header=None, index_col=0, skiprows=None, compression='bz2')
+shtax = pd.read_csv(sys.argv[2], sep="\t", header=None, index_col=0, skiprows=None, compression="bz2")
 # Replace taxonid with Domain = "Eukaryota"
-shtax.loc[:,1] = 'Eukaryota'
+shtax.loc[:, 1] = "Eukaryota"
 # Remove genus from species name
-shtax.loc[:,8] = shtax.loc[:,8].str.split(" ",1).str[1]
+shtax.loc[:, 8] = shtax.loc[:, 8].str.split(" ", 1).str[1]
 
 # Read taxonomy table
 # Determine number of taxonomy levels from header
 # ASV_ID Domain Kingdom Phylum Class Order Family Genus confidence sequence
-taxtable = pd.read_csv(sys.argv[3], sep='\t', header=0)
+taxtable = pd.read_csv(sys.argv[3], sep="\t", header=0)
 num_ranks = len(taxtable.columns) - 3
 # Add SH slot to table:
 # ASV_ID Domain Kingdom Phylum Class Order Family Genus SH confidence sequence
-taxtable.insert(num_ranks+1,"SH","", allow_duplicates=False)
-tax_entries = list(taxtable.columns)[1:num_ranks+3]
+taxtable.insert(num_ranks + 1, "SH", "", allow_duplicates=False)
+tax_entries = list(taxtable.columns)[1 : num_ranks + 3]
 
 # Go through vsearch matches and update taxonomy for those entries
-fh = open( sys.argv[4], mode = 'r' )
+fh = open(sys.argv[4], mode="r")
 prev_ASV = fh.readline().split()[0]
 fh.seek(0)
 matches = []
@@ -60,30 +60,30 @@
         tax = ""
         conf = 0.0
         for m in matches:
-            matchparts = m[0].split('|')
+            matchparts = m[0].split("|")
             try:
-                new_SH = seq2sh.loc[ matchparts[1] ][1]
+                new_SH = seq2sh.loc[matchparts[1]][1]
             except KeyError:
-                print( "WARNING: " + matchparts[1] + " not in seq2SH list", file=sys.stderr )
+                print("WARNING: " + matchparts[1] + " not in seq2SH list", file=sys.stderr)
                 new_SH = ""
-            if ( pd.isna( new_SH ) ) :
-                print( "WARNING: no SH reported for " + matchparts[1], file=sys.stderr )
+            if pd.isna(new_SH):
+                print("WARNING: no SH reported for " + matchparts[1], file=sys.stderr)
                 new_SH = ""
-            if SH != "" and new_SH != SH :
+            if SH != "" and new_SH != SH:
                 SH = ""
                 tax = ""
                 break
             elif new_SH != "":
                 SH = new_SH
                 try:
-                    tax = list(shtax.loc[ SH ])
+                    tax = list(shtax.loc[SH])
                 except KeyError:
-                    print( "WARNING: no taxonomy found for " + SH, file=sys.stderr )
-                    tax = [""]*num_ranks
-            conf = m[1]/100.0
+                    print("WARNING: no taxonomy found for " + SH, file=sys.stderr)
+                    tax = [""] * num_ranks
+            conf = m[1] / 100.0
         if SH != "":
             tax_list = tax[0:num_ranks] + [SH] + [conf]
-            taxtable.loc[ taxtable['ASV_ID'] == prev_ASV, tax_entries] = tax_list
+            taxtable.loc[taxtable["ASV_ID"] == prev_ASV, tax_entries] = tax_list
         prev_ASV = ASV
         maxid = -1
         maxlen = -1
@@ -101,35 +101,35 @@
     elif pid == maxid and alen == maxlen:
         matches.append([match, pid, alen])
 
-if match != "*": # Take care of last row/ASV in match file
+if match != "*":  # Take care of last row/ASV in match file
     SH = ""
     tax = ""
     conf = 0.0
     for m in matches:
-        matchparts = m[0].split('|')
+        matchparts = m[0].split("|")
         try:
-            new_SH = seq2sh.loc[ matchparts[1] ][1]
+            new_SH = seq2sh.loc[matchparts[1]][1]
         except KeyError:
-            print( "WARNING: " + matchparts[1] + " not in seq2SH list", file=sys.stderr )
+            print("WARNING: " + matchparts[1] + " not in seq2SH list", file=sys.stderr)
             new_SH = ""
-        if ( pd.isna( new_SH ) ) :
-            print( "WARNING: no SH reported for " + matchparts[1], file=sys.stderr )
+        if pd.isna(new_SH):
+            print("WARNING: no SH reported for " + matchparts[1], file=sys.stderr)
            new_SH = ""
-        if SH != "" and new_SH != SH :
+        if SH != "" and new_SH != SH:
            SH = ""
            tax = ""
            break
        elif new_SH != "":
            SH = new_SH
            try:
-                tax = list(shtax.loc[ SH ])
+                tax = list(shtax.loc[SH])
            except KeyError:
-                print( "WARNING: no taxonomy found for " + SH, file=sys.stderr )
-                tax = [""]*num_ranks
-        conf = m[1]/100.0
+                print("WARNING: no taxonomy found for " + SH, file=sys.stderr)
+                tax = [""] * num_ranks
+        conf = m[1] / 100.0
    if SH != "":
        tax_list = tax[0:num_ranks] + [SH] + [conf]
-        taxtable.loc[ taxtable['ASV_ID'] == prev_ASV, tax_entries] = tax_list
+        taxtable.loc[taxtable["ASV_ID"] == prev_ASV, tax_entries] = tax_list


 # Write new taxtable, with SH and new taxonomy added if found
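
The five positional arguments read near the top of the script (seq-to-SH mapping, SH taxonomy table, taxonomy TSV, vsearch blast6 matches, output path) suggest an invocation along these lines; the file names are hypothetical:

    # args: seq-to-SH mapping (bz2 TSV), SH taxonomy (bz2 TSV), taxonomy TSV, vsearch blast6 matches, output TSV
    add_sh_to_taxonomy.py seq2sh.txt.bz2 sh_taxonomy.txt.bz2 ASV_tax.tsv matches.blast6.tsv ASV_tax_SH.tsv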
24 changes: 12 additions & 12 deletions bin/count_table_minmax_reads.py
@@ -1,29 +1,29 @@
 #!/usr/bin/env python3
-#@author Daniel Straub
+# @author Daniel Straub
 # Takes one TSV count table from QIIME2
 # and reports the maximum or minimum counts of all samples.
 
 import pandas as pd
 import sys
 
-#argument check
-if len(sys.argv) != 3 or sys.argv[2] not in ['maximum','minimum']:
+# argument check
+if len(sys.argv) != 3 or sys.argv[2] not in ["maximum", "minimum"]:
     exit("Usage: count_table_max_reads.py <feature-table.tsv> <maximum/minimum>")
 
-#read tsv and skip first two rows
-data = pd.read_csv(sys.argv[1], sep='\t', skiprows=[0,1], header=None) #count table
+# read tsv and skip first two rows
+data = pd.read_csv(sys.argv[1], sep="\t", skiprows=[0, 1], header=None)  # count table
 
-#drop feature ids
+# drop feature ids
 df = data.drop(data.columns[0], axis=1)
 
-#make sums
+# make sums
 sums = df.sum()
 
-#determine maximum or minimum
-if sys.argv[2] == 'maximum':
+# determine maximum or minimum
+if sys.argv[2] == "maximum":
     out = int(sums.max())
-elif sys.argv[2] == 'minimum':
+elif sys.argv[2] == "minimum":
     out = int(sums.min())
 
-#print value
-print(out, end='')
+# print value
+print(out, end="")
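
Per the script's own usage string, it is called with a QIIME2 feature table and either maximum or minimum, and prints a single integer with no trailing newline (convenient for shell command substitution); the file name is illustrative:

    count_table_minmax_reads.py feature-table.tsv minimum    # prints the smallest per-sample read sum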
42 changes: 24 additions & 18 deletions bin/create_unite_taxfile.py
@@ -6,7 +6,7 @@
 #
 # By Jeanette Tångrot 2020-09-02
 
-#--- Import libraries, do initializations ---#
+# --- Import libraries, do initializations ---#
 import sys
 from Bio import SeqIO

@@ -16,30 +16,36 @@
 <unite_tax.txt> : Output. Name of text file with taxonomies.
 """
 
-#--- Check and read arguments ---#
+# --- Check and read arguments ---#
 if len(sys.argv) != 4:
-    exit("Usage: " + usage )
+    exit("Usage: " + usage)
 
 fasta_in = sys.argv[1]
 fasta_out = sys.argv[2]
 tax_out = sys.argv[3]
 
-#--- Read sequence file and create new records ---#
-replace_dict = {';p__': ';D_1__', ';c__': ';D_2__', ';o__': ';D_3__', ';f__': ';D_4__', ';g__': ';D_5__', ';s__': ';D_6__'}
-
-fh_fasta = open( fasta_out, mode = 'w' )
-fh_tax = open( tax_out, mode = 'w' )
-
-for entry in SeqIO.parse( fasta_in, "fasta" ):
-    (name, tax) = entry.id.split('|k__')
-    tax = 'D_0__' + tax
-    tax = tax.replace('unidentified','')
+# --- Read sequence file and create new records ---#
+replace_dict = {
+    ";p__": ";D_1__",
+    ";c__": ";D_2__",
+    ";o__": ";D_3__",
+    ";f__": ";D_4__",
+    ";g__": ";D_5__",
+    ";s__": ";D_6__",
+}
+
+fh_fasta = open(fasta_out, mode="w")
+fh_tax = open(tax_out, mode="w")
+
+for entry in SeqIO.parse(fasta_in, "fasta"):
+    (name, tax) = entry.id.split("|k__")
+    tax = "D_0__" + tax
+    tax = tax.replace("unidentified", "")
     for n1, n2 in replace_dict.items():
-        tax = tax.replace( n1, n2 )
-    tax = tax.replace('|SH','_SH')
-    fh_fasta.write('>' + name + '\n' + str(entry.seq).upper() + '\n')
-    fh_tax.write( name + '\t' + tax + '\n' )
+        tax = tax.replace(n1, n2)
+    tax = tax.replace("|SH", "_SH")
+    fh_fasta.write(">" + name + "\n" + str(entry.seq).upper() + "\n")
+    fh_tax.write(name + "\t" + tax + "\n")
 
 fh_fasta.close()
 fh_tax.close()
-
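
Matching the usage text and the three sys.argv reads, a sketch of the invocation (names illustrative):

    # args: UNITE input FASTA, reformatted output FASTA, output taxonomy text file
    create_unite_taxfile.py unite.fasta unite_seqs.fasta unite_tax.txt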

25 changes: 17 additions & 8 deletions bin/cutadapt_summary.py
@@ -1,25 +1,34 @@
 #!/usr/bin/env python3
 
-#--- Import libraries, do initializations ---#
+# --- Import libraries, do initializations ---#
 import re, sys
 from sys import argv
 
 usage = "Usage: cutadapt_summary.py <single_end/paired_end> cutadapt_log_*.txt"
 
-#--- Check and read arguments ---#
+# --- Check and read arguments ---#
 if len(argv) < 3:
     exit(usage)
 if argv[1] != "single_end" and argv[1] != "paired_end":
     exit(usage)
 
-regexes = [r" -o (\S+) ",
+regexes = [
+    r" -o (\S+) ",
     r"Total (?:read pairs|reads) processed:\s+([0-9,,]+)",
     r"Reverse-complemented:\s+([0-9,,]+)",
     r"(?:Pairs|Reads) written .+?:\s+([0-9,,]+)",
-    r"(?:Pairs|Reads) written .+?:.*?\(([^)]+)"]
+    r"(?:Pairs|Reads) written .+?:.*?\(([^)]+)",
+]
 
-columns = ["sample", "cutadapt_total_processed", "cutadapt_reverse_complemented", "cutadapt_passing_filters", "cutadapt_passing_filters_percent"]
+columns = [
+    "sample",
+    "cutadapt_total_processed",
+    "cutadapt_reverse_complemented",
+    "cutadapt_passing_filters",
+    "cutadapt_passing_filters_percent",
+]
 
-#--- Search each file using regex ---#
+# --- Search each file using regex ---#
 print("\t".join(columns))
 for FILE in argv[2:]:
     with open(FILE) as x:
@@ -32,8 +41,8 @@
         else:
             results.append("")
 
-    #modify sample names (all before ".")
+    # modify sample names (all before ".")
     results[0] = results[0].split(".", 1)[0]
 
-    #output per file
+    # output per file
     print("\t".join(results))
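
Per its usage string, the script takes the library layout followed by any number of cutadapt logs and prints a TSV summary to stdout; the log names and redirection are illustrative:

    cutadapt_summary.py paired_end cutadapt_log_sample1.txt cutadapt_log_sample2.txt > cutadapt_summary.tsv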
36 changes: 18 additions & 18 deletions bin/filter_stats.py
@@ -1,38 +1,38 @@
 #!/usr/bin/env python3
-#@author Daniel Straub
+# @author Daniel Straub
 # Takes two TSV count table from QIIME2
 # and reports how much counts were filtered.
 
 import pandas as pd
 import sys
 
-#argument check
+# argument check
 if len(sys.argv) != 3:
     exit("Usage: count_table_max_reads.py <unfiltered_feature-table.tsv> <filtered_feature-table.tsv>")
 
-#read tsv and skip first two rows
-data_unfiltered = pd.read_csv(sys.argv[1], sep='\t', skiprows=None) #count table
-data_filtered = pd.read_csv(sys.argv[2], sep='\t', skiprows=[0]) #count table
+# read tsv and skip first two rows
+data_unfiltered = pd.read_csv(sys.argv[1], sep="\t", skiprows=None)  # count table
+data_filtered = pd.read_csv(sys.argv[2], sep="\t", skiprows=[0])  # count table
 
-#drop feature ids
+# drop feature ids
 df_unfiltered = data_unfiltered.drop(data_unfiltered.columns[0], axis=1)
 df_filtered = data_filtered.drop(data_filtered.columns[0], axis=1)
 
-#make sample count sums
+# make sample count sums
 sums_unfiltered = df_unfiltered.sum()
 sums_filtered = df_filtered.sum()
 
-#merge dataframes
-out = sums_unfiltered.to_frame(name = 'unfiltered').join(sums_filtered.to_frame(name = 'filtered'))
-out['lost'] = out['unfiltered'] - out['filtered']
-out['retained_percent'] = out['filtered'] / out['unfiltered'] *100
-out['lost_percent'] = (100 - out['retained_percent'])
+# merge dataframes
+out = sums_unfiltered.to_frame(name="unfiltered").join(sums_filtered.to_frame(name="filtered"))
+out["lost"] = out["unfiltered"] - out["filtered"]
+out["retained_percent"] = out["filtered"] / out["unfiltered"] * 100
+out["lost_percent"] = 100 - out["retained_percent"]
 
-#add column with sample names at beginning
-out = out.rename_axis('sample').reset_index()
+# add column with sample names at beginning
+out = out.rename_axis("sample").reset_index()
 
-#rename columns
-out = out.rename(columns={'unfiltered': 'input_tax_filter', 'filtered': 'filtered_tax_filter'})
+# rename columns
+out = out.rename(columns={"unfiltered": "input_tax_filter", "filtered": "filtered_tax_filter"})
 
-#write file
-out.to_csv('count_table_filter_stats.tsv', sep='\t', index=False)
+# write file
+out.to_csv("count_table_filter_stats.tsv", sep="\t", index=False)
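
From the usage string and the hard-coded output name, a sketch of the invocation (input names illustrative):

    filter_stats.py unfiltered_feature-table.tsv filtered_feature-table.tsv    # writes count_table_filter_stats.tsv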