forked from dib-lab/khmer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathload-into-counting.py
executable file
·84 lines (61 loc) · 2.51 KB
/
load-into-counting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#! /usr/bin/env python
"""
Build a counting Bloom filter from the given sequences, save in <htname>.
% python scripts/load-into-counting.py <htname> <data1> [ <data2> <...> ]
Use '-h' for parameter help.
@CTB enable/disable bigcount via command-line parameters.
"""
import sys, screed
import khmer
from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE
###
def main():
parser = build_construct_args()
parser.add_argument('output_filename')
parser.add_argument('input_filenames', nargs='+')
args = parser.parse_args()
if not args.quiet:
if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
print>>sys.stderr, "** WARNING: hashsize is default! You absodefly want to increase this!\n** Please read the docs!"
print>>sys.stderr, '\nPARAMETERS:'
print>>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
print>>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
print>>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
print>>sys.stderr, ''
print>>sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % (args.n_hashes * args.min_hashsize)
print>>sys.stderr, '-'*8
K=args.ksize
HT_SIZE=args.min_hashsize
N_HT=args.n_hashes
base = args.output_filename
filenames = args.input_filenames
print 'Saving hashtable to %s' % base
print 'Loading kmers from sequences in %s' % repr(filenames)
###
print 'making hashtable'
ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
ht.set_use_bigcount(True)
for n, filename in enumerate(filenames):
print 'consuming input', filename
ht.consume_fasta(filename)
if n > 0 and n % 10 == 0:
print 'mid-save', base
ht.save(base)
open(base + '.info', 'w').write('through %s' % filename)
print 'saving', base
ht.save(base)
info_fp = open(base + '.info', 'w')
info_fp.write('through end: %s\n' % filename)
# Change 0.2 only if you really grok it. HINT: You don't.
fp_rate = khmer.calc_expected_collisions(ht)
print 'fp rate estimated to be %1.3f' % fp_rate
print >>info_fp, 'fp rate estimated to be %1.3f' % fp_rate
if fp_rate > 0.20:
print >>sys.stderr, "**"
print >>sys.stderr, "** ERROR: the counting hash is too small for"
print >>sys.stderr, "** this data set. Increase hashsize/num ht."
print >>sys.stderr, "**"
sys.exit(-1)
print 'DONE.'
if __name__ == '__main__':
main()