forked from dib-lab/khmer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcount-within-radius.py
44 lines (31 loc) · 942 Bytes
/
count-within-radius.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import sys, screed.fasta, os
import khmer
K = 32
HASHTABLE_SIZE=int(8e9)
N_HT = 4
RADIUS=100
###
MAX_DENSITY=2000
infile = sys.argv[1]
outfile = sys.argv[2]
if len(sys.argv) > 3:
RADIUS=int(sys.argv[3])
print 'saving to:', outfile
print 'making hashtable'
ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
print 'eating', infile
ht.consume_fasta(infile)
print 'loading'
ht.save(outfile + '.ht')
outfp = open(outfile, 'w')
for n, record in enumerate(screed.open(infile)):
if n % 10000 == 0:
print '... saving', n
seq = record['sequence']
for pos in range(0, len(seq), 200):
subseq = seq[pos:pos+200]
middle = (len(subseq) - K + 1) / 2
density = ht.count_kmers_within_radius(subseq[middle:middle+K], RADIUS,
MAX_DENSITY)
density /= float(RADIUS)
print >>outfp, '>%s d=%.3f\n%s' % (record['name'], density, subseq)