forked from dib-lab/khmer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter-min2-multi.py
55 lines (40 loc) · 1.19 KB
/
filter-min2-multi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import sys, screed.fasta, os
import khmer
from khmer.thread_utils import ThreadedSequenceProcessor, verbose_fasta_iter
K = 32
HT_SIZE=7e9
N_HT=4
WORKER_THREADS=8
GROUPSIZE=100
###
def main():
print '-- settings:'
print 'K', K
print 'N THREADS', WORKER_THREADS
print '--'
print 'making hashtable'
ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
for filename in sys.argv[1:]:
print 'consuming input', filename
ht.consume_fasta(filename)
def process_fn(record, ht=ht):
name = record['name']
seq = record['sequence']
if 'N' in seq:
return None, None
if len(seq) < K:
return None, None
if ht.get_min_count(seq) < 2:
return None, None
return name, seq
for filename in sys.argv[1:]:
print '***', filename
outfile = os.path.basename(filename) + '.f2'
if os.path.exists(outfile):
print 'SKIPPING', outfile, ' -- already exists'
continue
outfp = open(outfile, 'w')
tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)
tsp.start(verbose_fasta_iter(filename), outfp)
if __name__ == '__main__':
main()