forked from dib-lab/khmer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcount-density-by-position.py
45 lines (33 loc) · 994 Bytes
/
count-density-by-position.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import sys, screed.fasta, os
import khmer
K = 32
HASHTABLE_SIZE=int(8e9)
N_HT = 4
###
MAX_DENSITY=2000
infile = sys.argv[1]
outfile = sys.argv[2]
print 'saving to:', outfile
print 'making hashtable'
ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
print 'eating', infile
#ht.consume_fasta(infile)
ht.load(infile + '.ht')
RADIUS=10
hist = [0.0] * 200
histcount = [0] * 200
for n, record in enumerate(screed.fasta.fasta_iter(open(infile))):
if n % 1000 == 0:
print '... saving', n
seq = record['sequence']
for pos in range(0, len(seq) - K + 1):
density = ht.count_kmers_within_radius(seq[pos:pos+K], RADIUS,
MAX_DENSITY)
density /= float(RADIUS)
hist[pos] += density
histcount[pos] += 1
if n % 1000 == 0:
outfp = open(outfile, 'w')
for i in range(len(hist)):
if histcount[i]:
print >>outfp, i, hist[i], histcount[i], hist[i]/histcount[i]