Skip to content

Commit

Permalink
new stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
kyclark committed Dec 3, 2020
1 parent fa880f4 commit fbd518f
Show file tree
Hide file tree
Showing 79 changed files with 3,471 additions and 285 deletions.
Empty file added 16_fastx_grep/.out
Empty file.
8 changes: 8 additions & 0 deletions 16_fastx_grep/foo
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
@ITSLSUmock2p.ITS_M01380:138:000000000-C9GKM:1:1101:14440:2042 2:N:0
CAAGTTACTTCCTCTAAATGACCAAGCCTAGTGTAGAACCATGTCGTCAGTGTCAGTCTGAGTGTAGATCTCGGTGGTCGCCGTATCATTAAAAAAAAAAATGTAATACTACTAGTAATTATTAATATTATAATTTTGTCTATTAGCATCTTATTATAGATAGAAGATATTATTCATATTTCACTATCTTATACTGATATCAGCTTTATCAGATCACACTCTAGTGAAGATTGTTCTTAACTGAAATTTCCTTCTTCATACAGACACATTAATCTTACCTA
+
EFGGGGGGGGGCGGGGGFCFFFGGGGGFGGGGGGGGGGGFGGGGGGGFGFFFCFGGFFGGGGGGGGGFGGGGFGGGDG<FD@4@CFFGGGGCFFAFEFEG+,9,,,,99,,,5,,49,4,8,4,444,4,4,,,,,,,,,,,,,,8,,,,63,,,,,,,,376,3,,,,,,,8,,,,,,,,,+++++++++++++3++25+++0+*+0+*0+***))*0))1/+++**************.****.*******0*********/(,(/).)))1)).).).
@ITSLSUmock2p.ITS_M01384:138:000000000-C9GKM:1:1101:14440:2043 2:N:0
ACCCGTCAATTTCTTTAAGTTTTAGCCTTGCGACCGTACTCCCCAGGCGGTGCACTTAGTGGTTTTCCGGCGACCCGGGCGGCGTCAGAGCCCCCCAAGTCTCGTGCACATCGTTTACGGCGTGGACTACCAGGGTATCTAATCCTGTTTGATCCCCACGCTTTCGTGCCTCAGCGTCAGTACCGGCCCAGCCACCCGTCTTCACCTTCGGCGTTCCTGTAGATATCTACGCATTTCACCGCTACACCTACAGTTCCGGTGGCGCCTACCGGCCTCAAGAAACGCAGTATGCCCAGCTATT
+
CCCCCGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGFGGGGGGGGGGGG9FGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGCGFGFGGGGGGGGGGGGGGGGGDGGGDEEFGGGGGGGGGGGGFFGGG9EGFGGFCGGCGGGCCGEGGGEGFGGGG6B6*=EGGGGGGG8ECCEGGGGCFGDEEGGG?FGFGGC39:>EGGGGGGF7=9:A@FBF>DGCGFF=75C=DBCF74DFFFF*/91B>9>?9?>>:B?>F>FBB:??200:699>?AA2)34F?2))54
23 changes: 23 additions & 0 deletions 16_fastx_grep/mk-outs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/usr/bin/env bash
#
# Regenerate the expected-output fixtures (./tests/inputs/*.out) by running
# the program on the test inputs with each option combination listed below.

# -e: stop at the first failing command. The old fixtures are deleted first,
#     so carrying on after a failure would leave fixtures missing while the
#     script still printed "Done." and exited 0.
# -u: treat use of an unset variable as an error.
set -eu

PRG="./fastx_grep.py"
EMPTY="./tests/inputs/empty.fa"
LSU="./tests/inputs/lsu.fq"
LSU_FA="./tests/inputs/lsu.fa" # not referenced by any command below

# Remove previously generated outputs so that no stale fixture survives.
rm -f ./tests/inputs/*.out

# Empty input file
"$PRG" -o "$EMPTY.out" XXX "$EMPTY"

# Upper- and lower-case patterns, without -i
"$PRG" -o "$LSU.upper.out" LSU "$LSU"
"$PRG" -o "$LSU.lower.out" lsu "$LSU"

# The same patterns with the -i flag
"$PRG" -o "$LSU.i.upper.out" -i LSU "$LSU"
"$PRG" -o "$LSU.i.lower.out" -i lsu "$LSU"

# FASTQ input written in the two FASTA output formats
"$PRG" -O fasta -o "$LSU.fa.out" LSU "$LSU"
"$PRG" -O fasta-2line -o "$LSU.2fa.out" LSU "$LSU"

echo "Done."
96 changes: 62 additions & 34 deletions 16_fastx_grep/solution.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,22 @@
import re
import sys
from Bio import SeqIO
from typing import List, NamedTuple, TextIO


class Args(NamedTuple):
    """ Command-line arguments """
    # Search pattern; main() compiles it with re.compile
    pattern: str
    # Input file handles, processed in order
    files: List[TextIO]
    # Input format; when empty, main() guesses it from each file's name
    input_format: str
    # Output format; when empty, main() falls back to the input format
    output_format: str
    # Handle that matching records are written to
    outfile: TextIO
    # When true, main() prints progress messages to STDERR
    verbose: bool


# --------------------------------------------------
def get_args():
"""Get command-line arguments"""
def get_args() -> Args:
""" Get command-line arguments """

parser = argparse.ArgumentParser(
description='Grep through FASTX files',
Expand All @@ -24,7 +35,7 @@ def get_args():
parser.add_argument('file',
metavar='FILE',
nargs='+',
type=argparse.FileType('r'),
type=argparse.FileType('rt'),
help='Input file(s)')

parser.add_argument('-f',
Expand All @@ -35,7 +46,7 @@ def get_args():
default='')

parser.add_argument('-O',
'--out_format',
'--outfmt',
help='Output file format',
metavar='str',
choices=['fasta', 'fastq', 'fasta-2line'],
Expand All @@ -44,33 +55,69 @@ def get_args():
parser.add_argument('-o',
'--outfile',
help='Output file',
type=argparse.FileType('wt'),
metavar='FILE',
default=None)
default=sys.stdout)

parser.add_argument('-v',
'--verbose',
help='Be chatty',
action='store_true')

args = parser.parse_args()

if not args.format:
args.format = guess_format(args.file[0].name)
return Args(pattern=args.pattern,
files=args.file,
input_format=args.format,
output_format=args.outfmt,
outfile=args.outfile,
verbose=args.verbose)


# --------------------------------------------------
def main() -> None:
    """ Write the records whose ID or description matches the pattern

    For each input file, the format is taken from the arguments or, when
    that is empty, guessed from the file name; the program exits with an
    error message if neither yields a format. The output format defaults
    to the input format. With --verbose, progress goes to STDERR.
    """

    args = get_args()

    def progress(msg: str) -> None:
        """ Print a message to STDERR when --verbose is set """

        if args.verbose:
            print(msg, file=sys.stderr)

    regex = re.compile(args.pattern)
    num_checked, num_took = 0, 0
    for i, fh in enumerate(args.files, start=1):
        progress(f'{i:3}: {fh.name}')
        input_format = args.input_format or guess_format(fh.name)

        if not input_format:
            sys.exit(f'Please specify file format for "{fh.name}"')

        output_format = args.output_format or input_format

        for rec in SeqIO.parse(fh, input_format):
            num_checked += 1
            # A record is taken when either header field matches
            if any(map(regex.search, [rec.id, rec.description])):
                num_took += 1
                SeqIO.write(rec, args.outfile, output_format)

    # Identity check: we want to know if the handle *is* the STDOUT stream
    outfile = 'STDOUT' if args.outfile is sys.stdout else args.outfile.name
    progress(f'Done, checked {num_checked}, wrote {num_took} to "{outfile}".')


# --------------------------------------------------
def guess_format(filename: str) -> str:
    """ Guess format from extension

    Returns 'fasta' for the extensions .fa, .fna and .fasta, 'fastq' for
    .fq and .fastq, and the empty string for any other name.
    """

    # splitext() keeps the leading dot on the extension, so slice it off
    # (the earlier pattern '^.' removed *any* first character).
    ext = os.path.splitext(filename)[1][1:]

    # fullmatch() requires the whole extension to match; match() with '$'
    # would also accept an extension followed by a newline.
    if re.fullmatch(r'f(ast|n)?a', ext):
        return 'fasta'

    if re.fullmatch(r'f(ast)?q', ext):
        return 'fastq'

    return ''


# --------------------------------------------------
def test_guess_format():
"""Test guess_format"""
def test_guess_format() -> None:
""" Test guess_format """

assert guess_format('/foo/bar.fa') == 'fasta'
assert guess_format('/foo/bar.fna') == 'fasta'
Expand All @@ -80,25 +127,6 @@ def test_guess_format():
assert guess_format('/foo/bar.fx') == ''


# --------------------------------------------------
def main():
"""Make a jazz noise here"""

args = get_args()
regex = re.compile(args.pattern)
out_fh = args.outfile or sys.stdout
checked, took = 0, 0

for fh in args.file:
for rec in SeqIO.parse(fh, args.format):
checked += 1
if any(map(regex.search, [rec.id, rec.description])):
took += 1
SeqIO.write(rec, out_fh, args.out_format or args.format)

print(f'Done, checked {checked}, took {took}.', file=sys.stderr)


# --------------------------------------------------
if __name__ == '__main__':
main()
138 changes: 121 additions & 17 deletions 16_fastx_grep/tests/fastx_grep_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,25 @@
import string
import re
from subprocess import getstatusoutput
from typing import List, Optional

PRG = './fastx_grep.py'
RUN = f'python {PRG}' if platform.system() == 'Windows' else PRG
EMPTY = './tests/inputs/empty.fa'
LSU = './tests/inputs/lsu.fq'
LSU_FA = './tests/inputs/lsu.fa'
BAD_EXT = './tests/inputs/lsu.fx'


# --------------------------------------------------
def test_exists() -> None:
    """ The program file is present """

    assert os.path.isfile(PRG)


# --------------------------------------------------
def test_usage():
def test_usage() -> None:
"""usage"""

for flag in ['-h', '--help']:
Expand All @@ -42,27 +45,128 @@ def test_bad_file() -> None:
assert re.search(f"No such file or directory: '{bad}'", out)


# # --------------------------------------------------
# def run(input_file: str, expected_file: str) -> None:
# """ Runs on command-line input """
# --------------------------------------------------
def test_cannot_guess() -> None:
    """ Dies on unguessable extension """

    # No format option is passed, so the program has to guess the format
    # from the extension of BAD_EXT.
    pattern = random_string()
    rv, out = getstatusoutput(f'{RUN} {pattern} {BAD_EXT}')
    assert rv != 0
    assert out == f'Please specify file format for "{BAD_EXT}"'


# --------------------------------------------------
def run(pattern: str,
        input_file: str,
        expected_file: str,
        opts: Optional[List[str]] = None) -> None:
    """ Runs on command-line input

    Runs the program with the given options, pattern and input file, then
    asserts that it exits with status 0 and that its output equals the
    contents of the expected file with trailing whitespace removed.
    """

    # Close the fixture promptly rather than leaving it to the collector
    with open(expected_file) as fh:
        expected = fh.read().rstrip()

    # "opts or []" treats the default None as "no extra options"
    cmd = f"{RUN} {' '.join(opts or [])} {pattern} {input_file}"
    rv, out = getstatusoutput(cmd)
    assert rv == 0
    assert out == expected


# --------------------------------------------------
def test_empty_file() -> None:
    """ Empty input file gives the expected output """

    run(random_string(), EMPTY, f'{EMPTY}.out')


# --------------------------------------------------
def test_lsu_uppercase() -> None:
    """ Pattern "LSU" """

    run('LSU', LSU, f'{LSU}.upper.out')


# --------------------------------------------------
def test_lsu_lowercase() -> None:
    """ Pattern "lsu" """

    run('lsu', LSU, f'{LSU}.lower.out')


# expected = open(expected_file).read().rstrip()
# rv, out = getstatusoutput(f'{RUN} {input_file}')
# assert rv == 0
# assert out == expected
# --------------------------------------------------
def test_lsu_uppercase_insensitive() -> None:
    """ Pattern "LSU" with the -i flag """

    run('LSU', LSU, f'{LSU}.i.upper.out', opts=['-i'])


# --------------------------------------------------
def test_lsu_lowercase_insensitive() -> None:
    """ Pattern "lsu" with the --insensitive flag """

    run('lsu', LSU, f'{LSU}.i.lower.out', opts=['--insensitive'])


# # --------------------------------------------------
# def test_empty_file() -> None:
# """ Handles empty file """
# --------------------------------------------------
def test_outfile() -> None:
    """ outfile

    Runs with -o/--outfile and checks that nothing is printed and that the
    named file holds the expected records. The output file is removed
    afterwards, whether or not the assertions pass.
    """

    outfile = random_string()
    if os.path.isfile(outfile):
        os.remove(outfile)

    try:
        # Exercise the short and the long flag at random
        flag = '-o' if random.choice([0, 1]) else '--outfile'
        rv, out = getstatusoutput(f'{RUN} {flag} {outfile} LSU {LSU}')
        assert rv == 0
        assert out == ''
        assert os.path.isfile(outfile)

        with open(LSU + '.upper.out') as fh:
            expected = fh.read().rstrip()

        # Close the handle before the cleanup below removes the file
        with open(outfile) as fh:
            assert fh.read().rstrip() == expected

    finally:
        if os.path.isfile(outfile):
            os.remove(outfile)

# run(*EMPTY)

# # --------------------------------------------------
# def test_input1() -> None:
# """ Runs on command-line input """
# --------------------------------------------------
def test_outfile_verbose() -> None:
    """ outfile + verbose

    Runs with -v/--verbose and -o, then checks the progress messages that
    are printed and the records written to the output file. The output file
    is removed afterwards, whether or not the assertions pass.
    """

    outfile = random_string()
    if os.path.isfile(outfile):
        os.remove(outfile)

    try:
        # Exercise the short and the long flag at random
        flag = '-v' if random.choice([0, 1]) else '--verbose'
        rv, out = getstatusoutput(f'{RUN} {flag} -o {outfile} LSU {LSU}')
        assert rv == 0
        # The program prints the file counter right-aligned in a field of
        # width 3, so build the expected line with the same format and
        # from the same LSU constant that was passed on the command line.
        assert out.splitlines() == [
            f'{1:3}: {LSU}',
            f'Done, checked 4, wrote 2 to "{outfile}".'
        ]
        assert os.path.isfile(outfile)

        with open(LSU + '.upper.out') as fh:
            expected = fh.read().rstrip()

        # Close the handle before the cleanup below removes the file
        with open(outfile) as fh:
            assert fh.read().rstrip() == expected

    finally:
        if os.path.isfile(outfile):
            os.remove(outfile)


# --------------------------------------------------
def test_outfmt_fastq_to_fasta() -> None:
    """ FASTQ input, "fasta" output format """

    # Index 0 is the long flag and index 1 the short one
    flag = random.choice(['--outfmt', '-O'])
    run('LSU', LSU, f'{LSU}.fa.out', [f'{flag} fasta'])


# --------------------------------------------------
def test_outfmt_fastq_to_fasta2line() -> None:
    """ FASTQ input, "fasta-2line" output format """

    # Index 0 is the long flag and index 1 the short one
    flag = random.choice(['--outfmt', '-O'])
    run('LSU', LSU, f'{LSU}.2fa.out', [f'{flag} fasta-2line'])


# --------------------------------------------------
Expand Down
Empty file.
13 changes: 13 additions & 0 deletions 16_fastx_grep/tests/inputs/lsu.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
>ITSLSUmock2p.ITS_M01380:138:000000000-C9GKM:1:1101:14440:2042 2:N:0
CAAGTTACTTCCTCTAAATGACCAAGCCTAGTGTAGAACCATGTCGTCAGTGTCAGTCTG
AGTGTAGATCTCGGTGGTCGCCGTATCATTAAAAAAAAAAATGTAATACTACTAGTAATT
ATTAATATTATAATTTTGTCTATTAGCATCTTATTATAGATAGAAGATATTATTCATATT
TCACTATCTTATACTGATATCAGCTTTATCAGATCACACTCTAGTGAAGATTGTTCTTAA
CTGAAATTTCCTTCTTCATACAGACACATTAATCTTACCTA
>ITSLSUmock2p.ITS_M01384:138:000000000-C9GKM:1:1101:14440:2043 2:N:0
ACCCGTCAATTTCTTTAAGTTTTAGCCTTGCGACCGTACTCCCCAGGCGGTGCACTTAGT
GGTTTTCCGGCGACCCGGGCGGCGTCAGAGCCCCCCAAGTCTCGTGCACATCGTTTACGG
CGTGGACTACCAGGGTATCTAATCCTGTTTGATCCCCACGCTTTCGTGCCTCAGCGTCAG
TACCGGCCCAGCCACCCGTCTTCACCTTCGGCGTTCCTGTAGATATCTACGCATTTCACC
GCTACACCTACAGTTCCGGTGGCGCCTACCGGCCTCAAGAAACGCAGTATGCCCAGCTAT
T
13 changes: 12 additions & 1 deletion 16_fastx_grep/tests/inputs/lsu.fq
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
@M00773:480:000000000-BLYPT:1:2106:12063:1841 1:N:0:AGGCGACCTTA
TTTCTGTGCCAGCAGCCGCGGTAAGACAGAGGTGGCGAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCGGGTAGGCGGTTCGGCCAGTCAGATGTGAAATCCCACCGCTTAACGGTGGAACGGCGTCTGATACTACCGGACTTGAGTGCAGGAGAGGAGGGTGGAATTTCCGGTGTAGCGGTGAAATGCGTAGAGATCGGAAGGAACACCAGTGGCGAAGGCGGCCCTCTGGACTGCAACTGACGCTGAGACGCGAAAGCGTGGGGAGCACACAGGATTAGATACCCTGGTAGTCAACGC
+
CCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEFGGFEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGEGGGGGGGEGGGGGGDGDGGGGGGGGGGGFDGGGGGGGGFFFFDFG7FFGGGGGGGGG7EGGGGGDGGEGGGGGG=EFGDGGFGGDEGGGGFFC5;EEDFEFGEGFCFGEECC8?5CEE*:5*;?FGGFGCCFGAFFGGGDGGFFGCDECGGGGE;EE8EC=390;575>8<+9FGGFC<8CGFF:9+9,<D5)
@ITSLSUmock2p.ITS_M01380:138:000000000-C9GKM:1:1101:14440:2042 2:N:0
CAAGTTACTTCCTCTAAATGACCAAGCCTAGTGTAGAACCATGTCGTCAGTGTCAGTCTGAGTGTAGATCTCGGTGGTCGCCGTATCATTAAAAAAAAAAATGTAATACTACTAGTAATTATTAATATTATAATTTTGTCTATTAGCATCTTATTATAGATAGAAGATATTATTCATATTTCACTATCTTATACTGATATCAGCTTTATCAGATCACACTCTAGTGAAGATTGTTCTTAACTGAAATTTCCTTCTTCATACAGACACATTAATCTTACCTA
+
EFGGGGGGGGGCGGGGGFCFFFGGGGGFGGGGGGGGGGGFGGGGGGGFGFFFCFGGFFGGGGGGGGGFGGGGFGGGDG<FD@4@CFFGGGGCFFAFEFEG+,9,,,,99,,,5,,49,4,8,4,444,4,4,,,,,,,,,,,,,,8,,,,63,,,,,,,,376,3,,,,,,,8,,,,,,,,,+++++++++++++3++25+++0+*+0+*0+***))*0))1/+++**************.****.*******0*********/(,(/).)))1)).).).

@M00773:480:000000000-BLYPT:1:2106:12063:1841 2:N:0:AGGCGACCTTA
ACCCGCCAATTTCTTTGAGTTTCAACCTTGCGGCCGTACTCCCCAGGCGGGGCACTTACTCCGTTAGGTGCGGCACGGGAGGGGTCGATACCTCCCACACTTAGTGCCCATCGTTTAGGGCGTGGACTACCAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGCGTCTCAGCGTCAGTTGCAGTCCAGAGGGCCGCCTTCGCCACTGGTGTTCCTTCCGAGCCCTACGCATTTCACCGCTACACCGGAAATTCCACCCTCCTCTCCTGCACTCAAGTCTGGCAGTAGCAGACGCCGTT
+
CCCCCDGGGGGGGGGGGGG<CECGGGFGGGGGGGGGGGGGF@FGGGGGGFDGGGGCDFGGGGGGCGGGGFDGCFFGGCGEDGEF:FGGGGCGCFGGFFGG@EGGGGGGDGFGGCF7D>FGGG><FCEGCFEFGEE@<:CFGGGGGCGGCGFGEC<<EGG:C:E5FE*:?E8C?ECEGG88C;E9AFFFGF@:EF*;88*/A5EFG8*/ACC**:*7<00C:ED))(-7>)0)9DEDFFF>8=*0.6C75C>DD?:@*)9(41:??FFF(9:2<FF70?):A).3)(,46))5)..,3(,41
@ITSLSUmock2p.ITS_M01384:138:000000000-C9GKM:1:1101:14440:2043 2:N:0
ACCCGTCAATTTCTTTAAGTTTTAGCCTTGCGACCGTACTCCCCAGGCGGTGCACTTAGTGGTTTTCCGGCGACCCGGGCGGCGTCAGAGCCCCCCAAGTCTCGTGCACATCGTTTACGGCGTGGACTACCAGGGTATCTAATCCTGTTTGATCCCCACGCTTTCGTGCCTCAGCGTCAGTACCGGCCCAGCCACCCGTCTTCACCTTCGGCGTTCCTGTAGATATCTACGCATTTCACCGCTACACCTACAGTTCCGGTGGCGCCTACCGGCCTCAAGAAACGCAGTATGCCCAGCTATT
+
CCCCCGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGFGGGGGGGGGGGG9FGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGCGFGFGGGGGGGGGGGGGGGGGDGGGDEEFGGGGGGGGGGGGFFGGG9EGFGGFCGGCGGGCCGEGGGEGFGGGG6B6*=EGGGGGGG8ECCEGGGGCFGDEEGGG?FGFGGC39:>EGGGGGGF7=9:A@FBF>DGCGFF=75C=DBCF74DFFFF*/91B>9>?9?>>:B?>F>FBB:??200:699>?AA2)34F?2))54
4 changes: 4 additions & 0 deletions 16_fastx_grep/tests/inputs/lsu.fq.2fa.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>ITSLSUmock2p.ITS_M01380:138:000000000-C9GKM:1:1101:14440:2042 2:N:0
CAAGTTACTTCCTCTAAATGACCAAGCCTAGTGTAGAACCATGTCGTCAGTGTCAGTCTGAGTGTAGATCTCGGTGGTCGCCGTATCATTAAAAAAAAAAATGTAATACTACTAGTAATTATTAATATTATAATTTTGTCTATTAGCATCTTATTATAGATAGAAGATATTATTCATATTTCACTATCTTATACTGATATCAGCTTTATCAGATCACACTCTAGTGAAGATTGTTCTTAACTGAAATTTCCTTCTTCATACAGACACATTAATCTTACCTA
>ITSLSUmock2p.ITS_M01384:138:000000000-C9GKM:1:1101:14440:2043 2:N:0
ACCCGTCAATTTCTTTAAGTTTTAGCCTTGCGACCGTACTCCCCAGGCGGTGCACTTAGTGGTTTTCCGGCGACCCGGGCGGCGTCAGAGCCCCCCAAGTCTCGTGCACATCGTTTACGGCGTGGACTACCAGGGTATCTAATCCTGTTTGATCCCCACGCTTTCGTGCCTCAGCGTCAGTACCGGCCCAGCCACCCGTCTTCACCTTCGGCGTTCCTGTAGATATCTACGCATTTCACCGCTACACCTACAGTTCCGGTGGCGCCTACCGGCCTCAAGAAACGCAGTATGCCCAGCTATT
Loading

0 comments on commit fbd518f

Please sign in to comment.