run black
d4straub committed Sep 1, 2022
1 parent 8d8b931 commit 2feaebc
Showing 7 changed files with 135 additions and 120 deletions.
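Every hunk below is mechanical reformatting by the black code formatter (double quotes, spaced operators, exploded multi-line literals); no script's behavior changes. As a rough sketch, a pass like this can be reproduced from the repository root with black's CLI. The line length is an assumption here (nf-core pipelines commonly configure 120, which would explain the long read_csv calls left on one line below); it is not stated in the commit itself:

    pip install black
    black --check --line-length 120 bin/    # report which scripts would be reformatted
    black --line-length 120 bin/            # rewrite them in place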
29 changes: 14 additions & 15 deletions bin/add_full_sequence_to_taxfile.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-#@author Jeanette Tangrot
+# @author Jeanette Tangrot
 # Takes one TSV taxonomy file from DADA2 and a sequence fasta file,
 # adds sequence to taxonomy based on ASV_ID

@@ -12,27 +12,26 @@
 
 # Read tsv and remove sequence column
 taxfile = sys.argv[1]
-tax = pd.read_csv(taxfile, sep='\t', header=0)
-tax.drop(columns='sequence', inplace=True)
+tax = pd.read_csv(taxfile, sep="\t", header=0)
+tax.drop(columns="sequence", inplace=True)
 
 # Read fasta file and store as data frame
-seqs = pd.DataFrame(columns=["id","sequence"])
+seqs = pd.DataFrame(columns=["id", "sequence"])
 seq = ""
 name = ""
-with open(sys.argv[2], 'r') as reader:
+with open(sys.argv[2], "r") as reader:
     for line in reader:
-        if line.startswith('>'):
-            if (seq != "" and name != ""):
-                seqs = seqs.append({'id':name, 'sequence': seq}, ignore_index=True)
+        if line.startswith(">"):
+            if seq != "" and name != "":
+                seqs = seqs.append({"id": name, "sequence": seq}, ignore_index=True)
             seq = ""
-            name = line.lstrip('>').rstrip('\s+*\n')
+            name = line.lstrip(">").rstrip("\s+*\n")
         else:
-            seq = seq + line.rstrip('\n')
-if (seq != "" and name != ""):
-    seqs = seqs.append({'id':name, 'sequence': seq}, ignore_index=True)
+            seq = seq + line.rstrip("\n")
+if seq != "" and name != "":
+    seqs = seqs.append({"id": name, "sequence": seq}, ignore_index=True)
 
 # Join taxonomy and full sequence, write to file
-tax = tax.set_index('ASV_ID').join(seqs.set_index('id'), how='outer')
+tax = tax.set_index("ASV_ID").join(seqs.set_index("id"), how="outer")
 outfile = sys.argv[3]
-tax.to_csv(outfile, sep='\t',na_rep="", index_label="ASV_ID")
-
+tax.to_csv(outfile, sep="\t", na_rep="", index_label="ASV_ID")
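
For orientation, the three sys.argv reads above imply an invocation like the following sketch (file names are illustrative, not from this commit):

    # args: DADA2 taxonomy TSV, ASV FASTA, output TSV
    add_full_sequence_to_taxfile.py ASV_tax.tsv ASV_seqs.fasta ASV_tax_with_seq.tsv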
64 changes: 32 additions & 32 deletions bin/add_sh_to_taxonomy.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-#@author Jeanette Tångrot
+# @author Jeanette Tångrot
 
 # Adds UNITE species hypothesis (SH) information to ASV table based on vsearch usearch_global results in blast6 format.
 #
@@ -22,29 +22,29 @@
 outfile = sys.argv[5]
 
 # Read sequence to SH matchings
-seq2sh = pd.read_csv(sys.argv[1], sep='\t', header=None, index_col=0, skiprows=None, compression='bz2')
+seq2sh = pd.read_csv(sys.argv[1], sep="\t", header=None, index_col=0, skiprows=None, compression="bz2")
 
 # Read SH taxonomies
 # Columns:
 # SH taxonid kingdom phylum class order family genus species
-shtax = pd.read_csv(sys.argv[2], sep='\t', header=None, index_col=0, skiprows=None, compression='bz2')
+shtax = pd.read_csv(sys.argv[2], sep="\t", header=None, index_col=0, skiprows=None, compression="bz2")
 # Replace taxonid with Domain = "Eukaryota"
-shtax.loc[:,1] = 'Eukaryota'
+shtax.loc[:, 1] = "Eukaryota"
 # Remove genus from species name
-shtax.loc[:,8] = shtax.loc[:,8].str.split(" ",1).str[1]
+shtax.loc[:, 8] = shtax.loc[:, 8].str.split(" ", 1).str[1]
 
 # Read taxonomy table
 # Determine number of taxonomy levels from header
 # ASV_ID Domain Kingdom Phylum Class Order Family Genus confidence sequence
-taxtable = pd.read_csv(sys.argv[3], sep='\t', header=0)
+taxtable = pd.read_csv(sys.argv[3], sep="\t", header=0)
 num_ranks = len(taxtable.columns) - 3
 # Add SH slot to table:
 # ASV_ID Domain Kingdom Phylum Class Order Family Genus SH confidence sequence
-taxtable.insert(num_ranks+1,"SH","", allow_duplicates=False)
-tax_entries = list(taxtable.columns)[1:num_ranks+3]
+taxtable.insert(num_ranks + 1, "SH", "", allow_duplicates=False)
+tax_entries = list(taxtable.columns)[1 : num_ranks + 3]
 
 # Go through vsearch matches and update taxonomy for those entries
-fh = open( sys.argv[4], mode = 'r' )
+fh = open(sys.argv[4], mode="r")
 prev_ASV = fh.readline().split()[0]
 fh.seek(0)
 matches = []
@@ -60,30 +60,30 @@
         tax = ""
         conf = 0.0
         for m in matches:
-            matchparts = m[0].split('|')
+            matchparts = m[0].split("|")
             try:
-                new_SH = seq2sh.loc[ matchparts[1] ][1]
+                new_SH = seq2sh.loc[matchparts[1]][1]
             except KeyError:
-                print( "WARNING: " + matchparts[1] + " not in seq2SH list", file=sys.stderr )
+                print("WARNING: " + matchparts[1] + " not in seq2SH list", file=sys.stderr)
                 new_SH = ""
-            if ( pd.isna( new_SH ) ) :
-                print( "WARNING: no SH reported for " + matchparts[1], file=sys.stderr )
+            if pd.isna(new_SH):
+                print("WARNING: no SH reported for " + matchparts[1], file=sys.stderr)
                 new_SH = ""
-            if SH != "" and new_SH != SH :
+            if SH != "" and new_SH != SH:
                 SH = ""
                 tax = ""
                 break
             elif new_SH != "":
                 SH = new_SH
                 try:
-                    tax = list(shtax.loc[ SH ])
+                    tax = list(shtax.loc[SH])
                 except KeyError:
-                    print( "WARNING: no taxonomy found for " + SH, file=sys.stderr )
-                    tax = [""]*num_ranks
-            conf = m[1]/100.0
+                    print("WARNING: no taxonomy found for " + SH, file=sys.stderr)
+                    tax = [""] * num_ranks
+            conf = m[1] / 100.0
         if SH != "":
             tax_list = tax[0:num_ranks] + [SH] + [conf]
-            taxtable.loc[ taxtable['ASV_ID'] == prev_ASV, tax_entries] = tax_list
+            taxtable.loc[taxtable["ASV_ID"] == prev_ASV, tax_entries] = tax_list
         prev_ASV = ASV
         maxid = -1
         maxlen = -1
@@ -101,35 +101,35 @@
     elif pid == maxid and alen == maxlen:
         matches.append([match, pid, alen])
 
-if match != "*": # Take care of last row/ASV in match file
+if match != "*":  # Take care of last row/ASV in match file
     SH = ""
     tax = ""
     conf = 0.0
     for m in matches:
-        matchparts = m[0].split('|')
+        matchparts = m[0].split("|")
         try:
-            new_SH = seq2sh.loc[ matchparts[1] ][1]
+            new_SH = seq2sh.loc[matchparts[1]][1]
         except KeyError:
-            print( "WARNING: " + matchparts[1] + " not in seq2SH list", file=sys.stderr )
+            print("WARNING: " + matchparts[1] + " not in seq2SH list", file=sys.stderr)
             new_SH = ""
-        if ( pd.isna( new_SH ) ) :
-            print( "WARNING: no SH reported for " + matchparts[1], file=sys.stderr )
+        if pd.isna(new_SH):
+            print("WARNING: no SH reported for " + matchparts[1], file=sys.stderr)
            new_SH = ""
-        if SH != "" and new_SH != SH :
+        if SH != "" and new_SH != SH:
            SH = ""
            tax = ""
            break
        elif new_SH != "":
            SH = new_SH
            try:
-                tax = list(shtax.loc[ SH ])
+                tax = list(shtax.loc[SH])
            except KeyError:
-                print( "WARNING: no taxonomy found for " + SH, file=sys.stderr )
-                tax = [""]*num_ranks
-        conf = m[1]/100.0
+                print("WARNING: no taxonomy found for " + SH, file=sys.stderr)
+                tax = [""] * num_ranks
+        conf = m[1] / 100.0
    if SH != "":
        tax_list = tax[0:num_ranks] + [SH] + [conf]
-        taxtable.loc[ taxtable['ASV_ID'] == prev_ASV, tax_entries] = tax_list
+        taxtable.loc[taxtable["ASV_ID"] == prev_ASV, tax_entries] = tax_list


 # Write new taxtable, with SH and new taxonomy added if found
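
The five positional arguments read near the top of the script (seq-to-SH mapping, SH taxonomy table, taxonomy TSV, vsearch blast6 matches, output path) suggest an invocation along these lines; the file names are hypothetical:

    # args: seq-to-SH mapping (bz2 TSV), SH taxonomy (bz2 TSV), taxonomy TSV, vsearch blast6 matches, output TSV
    add_sh_to_taxonomy.py seq2sh.txt.bz2 sh_taxonomy.txt.bz2 ASV_tax.tsv matches.blast6.tsv ASV_tax_SH.tsv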
24 changes: 12 additions & 12 deletions bin/count_table_minmax_reads.py
@@ -1,29 +1,29 @@
 #!/usr/bin/env python3
-#@author Daniel Straub
+# @author Daniel Straub
 # Takes one TSV count table from QIIME2
 # and reports the maximum or minimum counts of all samples.
 
 import pandas as pd
 import sys
 
-#argument check
-if len(sys.argv) != 3 or sys.argv[2] not in ['maximum','minimum']:
+# argument check
+if len(sys.argv) != 3 or sys.argv[2] not in ["maximum", "minimum"]:
     exit("Usage: count_table_max_reads.py <feature-table.tsv> <maximum/minimum>")
 
-#read tsv and skip first two rows
-data = pd.read_csv(sys.argv[1], sep='\t', skiprows=[0,1], header=None) #count table
+# read tsv and skip first two rows
+data = pd.read_csv(sys.argv[1], sep="\t", skiprows=[0, 1], header=None)  # count table
 
-#drop feature ids
+# drop feature ids
 df = data.drop(data.columns[0], axis=1)
 
-#make sums
+# make sums
 sums = df.sum()
 
-#determine maximum or minimum
-if sys.argv[2] == 'maximum':
+# determine maximum or minimum
+if sys.argv[2] == "maximum":
     out = int(sums.max())
-elif sys.argv[2] == 'minimum':
+elif sys.argv[2] == "minimum":
     out = int(sums.min())
 
-#print value
-print(out, end='')
+# print value
+print(out, end="")
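
Per the script's own usage string, it is called with a QIIME2 feature table and either maximum or minimum, and prints a single integer with no trailing newline (convenient for shell command substitution); the file name is illustrative:

    count_table_minmax_reads.py feature-table.tsv minimum    # prints the smallest per-sample read sum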
42 changes: 24 additions & 18 deletions bin/create_unite_taxfile.py
@@ -6,7 +6,7 @@
 #
 # By Jeanette Tångrot 2020-09-02
 
-#--- Import libraries, do initializations ---#
+# --- Import libraries, do initializations ---#
 import sys
 from Bio import SeqIO

@@ -16,30 +16,36 @@
 <unite_tax.txt> : Output. Name of text file with taxonomies.
 """
 
-#--- Check and read arguments ---#
+# --- Check and read arguments ---#
 if len(sys.argv) != 4:
-    exit("Usage: " + usage )
+    exit("Usage: " + usage)
 
 fasta_in = sys.argv[1]
 fasta_out = sys.argv[2]
 tax_out = sys.argv[3]
 
-#--- Read sequence file and create new records ---#
-replace_dict = {';p__': ';D_1__', ';c__': ';D_2__', ';o__': ';D_3__', ';f__': ';D_4__', ';g__': ';D_5__', ';s__': ';D_6__'}
-
-fh_fasta = open( fasta_out, mode = 'w' )
-fh_tax = open( tax_out, mode = 'w' )
-
-for entry in SeqIO.parse( fasta_in, "fasta" ):
-    (name, tax) = entry.id.split('|k__')
-    tax = 'D_0__' + tax
-    tax = tax.replace('unidentified','')
+# --- Read sequence file and create new records ---#
+replace_dict = {
+    ";p__": ";D_1__",
+    ";c__": ";D_2__",
+    ";o__": ";D_3__",
+    ";f__": ";D_4__",
+    ";g__": ";D_5__",
+    ";s__": ";D_6__",
+}
+
+fh_fasta = open(fasta_out, mode="w")
+fh_tax = open(tax_out, mode="w")
+
+for entry in SeqIO.parse(fasta_in, "fasta"):
+    (name, tax) = entry.id.split("|k__")
+    tax = "D_0__" + tax
+    tax = tax.replace("unidentified", "")
     for n1, n2 in replace_dict.items():
-        tax = tax.replace( n1, n2 )
-    tax = tax.replace('|SH','_SH')
-    fh_fasta.write('>' + name + '\n' + str(entry.seq).upper() + '\n')
-    fh_tax.write( name + '\t' + tax + '\n' )
+        tax = tax.replace(n1, n2)
+    tax = tax.replace("|SH", "_SH")
+    fh_fasta.write(">" + name + "\n" + str(entry.seq).upper() + "\n")
+    fh_tax.write(name + "\t" + tax + "\n")
 
 fh_fasta.close()
 fh_tax.close()
-
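
Matching the usage text and the three sys.argv reads, a sketch of the invocation (names illustrative):

    # args: UNITE input FASTA, reformatted output FASTA, output taxonomy text file
    create_unite_taxfile.py unite.fasta unite_seqs.fasta unite_tax.txt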

25 changes: 17 additions & 8 deletions bin/cutadapt_summary.py
@@ -1,25 +1,34 @@
 #!/usr/bin/env python3
 
-#--- Import libraries, do initializations ---#
+# --- Import libraries, do initializations ---#
 import re, sys
 from sys import argv
 
 usage = "Usage: cutadapt_summary.py <single_end/paired_end> cutadapt_log_*.txt"
 
-#--- Check and read arguments ---#
+# --- Check and read arguments ---#
 if len(argv) < 3:
     exit(usage)
 if argv[1] != "single_end" and argv[1] != "paired_end":
     exit(usage)
 
-regexes = [r" -o (\S+) ",
+regexes = [
+    r" -o (\S+) ",
     r"Total (?:read pairs|reads) processed:\s+([0-9,,]+)",
     r"Reverse-complemented:\s+([0-9,,]+)",
     r"(?:Pairs|Reads) written .+?:\s+([0-9,,]+)",
-    r"(?:Pairs|Reads) written .+?:.*?\(([^)]+)"]
+    r"(?:Pairs|Reads) written .+?:.*?\(([^)]+)",
+]
 
-columns = ["sample", "cutadapt_total_processed", "cutadapt_reverse_complemented", "cutadapt_passing_filters", "cutadapt_passing_filters_percent"]
+columns = [
+    "sample",
+    "cutadapt_total_processed",
+    "cutadapt_reverse_complemented",
+    "cutadapt_passing_filters",
+    "cutadapt_passing_filters_percent",
+]
 
-#--- Search each file using regex ---#
+# --- Search each file using regex ---#
 print("\t".join(columns))
 for FILE in argv[2:]:
     with open(FILE) as x:
@@ -32,8 +41,8 @@
         else:
             results.append("")
 
-    #modify sample names (all before ".")
+    # modify sample names (all before ".")
     results[0] = results[0].split(".", 1)[0]
 
-    #output per file
+    # output per file
     print("\t".join(results))
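
Per its usage string, the script takes the library layout followed by any number of cutadapt logs and prints a TSV summary to stdout; the log names and redirection are illustrative:

    cutadapt_summary.py paired_end cutadapt_log_sample1.txt cutadapt_log_sample2.txt > cutadapt_summary.tsv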
36 changes: 18 additions & 18 deletions bin/filter_stats.py
@@ -1,38 +1,38 @@
 #!/usr/bin/env python3
-#@author Daniel Straub
+# @author Daniel Straub
 # Takes two TSV count table from QIIME2
 # and reports how much counts were filtered.
 
 import pandas as pd
 import sys
 
-#argument check
+# argument check
 if len(sys.argv) != 3:
     exit("Usage: count_table_max_reads.py <unfiltered_feature-table.tsv> <filtered_feature-table.tsv>")
 
-#read tsv and skip first two rows
-data_unfiltered = pd.read_csv(sys.argv[1], sep='\t', skiprows=None) #count table
-data_filtered = pd.read_csv(sys.argv[2], sep='\t', skiprows=[0]) #count table
+# read tsv and skip first two rows
+data_unfiltered = pd.read_csv(sys.argv[1], sep="\t", skiprows=None)  # count table
+data_filtered = pd.read_csv(sys.argv[2], sep="\t", skiprows=[0])  # count table
 
-#drop feature ids
+# drop feature ids
 df_unfiltered = data_unfiltered.drop(data_unfiltered.columns[0], axis=1)
 df_filtered = data_filtered.drop(data_filtered.columns[0], axis=1)
 
-#make sample count sums
+# make sample count sums
 sums_unfiltered = df_unfiltered.sum()
 sums_filtered = df_filtered.sum()
 
-#merge dataframes
-out = sums_unfiltered.to_frame(name = 'unfiltered').join(sums_filtered.to_frame(name = 'filtered'))
-out['lost'] = out['unfiltered'] - out['filtered']
-out['retained_percent'] = out['filtered'] / out['unfiltered'] *100
-out['lost_percent'] = (100 - out['retained_percent'])
+# merge dataframes
+out = sums_unfiltered.to_frame(name="unfiltered").join(sums_filtered.to_frame(name="filtered"))
+out["lost"] = out["unfiltered"] - out["filtered"]
+out["retained_percent"] = out["filtered"] / out["unfiltered"] * 100
+out["lost_percent"] = 100 - out["retained_percent"]
 
-#add column with sample names at beginning
-out = out.rename_axis('sample').reset_index()
+# add column with sample names at beginning
+out = out.rename_axis("sample").reset_index()
 
-#rename columns
-out = out.rename(columns={'unfiltered': 'input_tax_filter', 'filtered': 'filtered_tax_filter'})
+# rename columns
+out = out.rename(columns={"unfiltered": "input_tax_filter", "filtered": "filtered_tax_filter"})
 
-#write file
-out.to_csv('count_table_filter_stats.tsv', sep='\t', index=False)
+# write file
+out.to_csv("count_table_filter_stats.tsv", sep="\t", index=False)
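
From the usage string and the hard-coded output name, a sketch of the invocation (input names illustrative):

    filter_stats.py unfiltered_feature-table.tsv filtered_feature-table.tsv    # writes count_table_filter_stats.tsv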