forked from kyclark/biofx_python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
solution.py
executable file
·122 lines (95 loc) · 3.48 KB
/
solution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python3
""" Probabilistically subset FASTA files """
import argparse
import os
import random
from Bio import SeqIO
from typing import List, NamedTuple, Optional, TextIO
class Args(NamedTuple):
    """ Immutable bundle of the sampler's command-line options """

    # Open, readable handles -- one per positional FILE argument
    files: List[TextIO]
    # Record format used to parse the inputs ('fasta' or 'fastq')
    file_format: str
    # Chance of keeping each record; validated to lie strictly in (0, 1)
    percent: float
    # Per-file cap on the number of records written; 0 disables the cap
    max_reads: int
    # Value handed to random.seed(); None lets `random` pick its own seed
    seed: Optional[int]
    # Directory that receives one sampled file per input
    outdir: str
# --------------------------------------------------
def get_args() -> Args:
    """ Parse and validate the command-line arguments

    Each positional FILE is opened for reading by ``argparse.FileType``
    while the command line is parsed, so the returned ``files`` are live
    handles. As a side effect the output directory is created when it does
    not exist yet.

    Returns:
        An ``Args`` tuple holding the validated options.

    Raises:
        SystemExit: via ``argparse`` when an argument is malformed, when
            ``--percent`` is not strictly between 0 and 1, or when
            ``--max`` is negative.
        OSError: if the output directory cannot be created (for example
            the path already exists as a regular file).
    """

    parser = argparse.ArgumentParser(
        description='Probabilistically subset FASTA files',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('file',
                        metavar='FILE',
                        type=argparse.FileType('r'),
                        nargs='+',
                        help='Input FASTA/Q file(s)')

    parser.add_argument('-f',
                        '--format',
                        help='Input file format',
                        metavar='format',
                        type=str,
                        choices=['fasta', 'fastq'],
                        default='fasta')

    parser.add_argument('-p',
                        '--percent',
                        help='Percent of reads',
                        metavar='reads',
                        type=float,
                        default=.1)

    parser.add_argument('-m',
                        '--max',
                        help='Maximum number of reads',
                        metavar='max',
                        type=int,
                        default=0)

    parser.add_argument('-s',
                        '--seed',
                        help='Random seed value',
                        metavar='seed',
                        type=int,
                        default=None)

    parser.add_argument('-o',
                        '--outdir',
                        help='Output directory',
                        metavar='DIR',
                        type=str,
                        default='out')

    args = parser.parse_args()

    # The chained comparison is also False for NaN, so that is rejected too
    if not 0 < args.percent < 1:
        parser.error(f'--percent "{args.percent}" must be between 0 and 1')

    # 0 means "no limit"; a negative cap could never be reached and would
    # silently behave like "no limit", so reject it up front
    if args.max < 0:
        parser.error(f'--max "{args.max}" must be 0 or greater')

    # exist_ok avoids the check-then-create race of a separate isdir() test
    os.makedirs(args.outdir, exist_ok=True)

    return Args(files=args.file,
                file_format=args.format,
                percent=args.percent,
                max_reads=args.max,
                seed=args.seed,
                outdir=args.outdir)
# --------------------------------------------------
def main() -> None:
    """ Randomly sample records from each input file into the output directory

    For every input handle a file with the same basename is written under
    ``args.outdir``. Each record is kept when ``random.random()`` is at or
    below ``args.percent``; sampling of a file stops early once
    ``args.max_reads`` records were kept (0 means no limit). Progress and a
    final summary are printed to standard output.

    Raises:
        SystemExit: if the output path is the very file being read, because
            opening it for writing would truncate the input.
    """

    args = get_args()
    random.seed(args.seed)

    total_num = 0
    for i, fh in enumerate(args.files, start=1):
        basename = os.path.basename(fh.name)
        out_file = os.path.join(args.outdir, basename)
        print(f'{i:3}: {basename}')

        # Opening with 'wt' truncates the target, which would wipe an input
        # that lives in the output directory. Both paths are checked for
        # existence first because samefile() raises on a missing path
        # (e.g. the pseudo-name of standard input).
        if (os.path.exists(fh.name) and os.path.exists(out_file)
                and os.path.samefile(fh.name, out_file)):
            raise SystemExit(
                f'Refusing to overwrite input file "{fh.name}"; '
                'choose a different --outdir')

        num_taken = 0
        # The context manager closes the output even if parsing or writing
        # raises part-way through the file
        with open(out_file, 'wt') as out_fh:
            for rec in SeqIO.parse(fh, args.file_format):
                if random.random() <= args.percent:
                    num_taken += 1
                    # NOTE(review): records are always written as FASTA,
                    # even when --format is fastq -- confirm this is intended
                    SeqIO.write(rec, out_fh, 'fasta')

                    # The count only changes here, so this is the only
                    # place the cap can be reached
                    if args.max_reads and num_taken == args.max_reads:
                        break

        total_num += num_taken

    num_files = len(args.files)
    print(f'Wrote {total_num:,} sequence{"" if total_num == 1 else "s"} '
          f'from {num_files:,} file{"" if num_files == 1 else "s"} '
          f'to directory "{args.outdir}".')
# --------------------------------------------------
# Run the sampler only when executed as a script, not when imported
if __name__ == '__main__':
    main()