#! /usr/bin/env python
"""
Eliminate reads whose kadian k-mer abundance is already at or above
DESIRED_COVERAGE. Output sequences will be placed in 'infile.keepkad'.

% python scripts/normalize-by-kadian.py [ -C <cutoff> ] <data1> <data2> ...

Use '-h' for parameter help.
"""

import sys
import os

import screed
import khmer
from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE

DEFAULT_DESIRED_COVERAGE = 5


def main():
    parser = build_construct_args()
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, "** WARNING: hashsize is default! " \
                "You absolutely want to increase this!\n** Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >>sys.stderr, ''
        print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % (args.n_hashes * args.min_hashsize)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
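
    # Note: khmer's counting hash is a fixed-size probabilistic counter
    # (essentially a count-min sketch), so counts can only be
    # overestimated; the false-positive-rate check at the end of main()
    # guards against undersizing it.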
    total = 0
    discarded = 0

    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepkad'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

            total += 1

            if len(record.sequence) < K:
                continue
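
            # khmer only hashes A/C/G/T, so ambiguous 'N' bases are
            # masked to 'A' before counting.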
            seq = record.sequence.replace('N', 'A')
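
            # The 'kadian' is a median-like summary of this read's k-mer
            # abundances (see CountingHash.get_kadian_count for the exact
            # definition); it serves as an estimate of how much coverage
            # of this read has already been kept.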
            kad = ht.get_kadian_count(seq)

            if kad < DESIRED_COVERAGE:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += 1

        print 'DONE with', input_filename, '; kept', total - discarded, \
            'of', total, 'or', \
            int(100. - discarded / float(total) * 100.), '%'
        print 'output in', output_name

        if args.savehash:
            print 'Saving hashfile through', input_filename
            print '...saving to', args.savehash
            ht.save(args.savehash)
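
    # (Presumably the per-file save above doubles as a checkpoint: an
    # interrupted run can be resumed from the last completed file with
    # -l/--loadhash.)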

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set. Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)


if __name__ == '__main__':
    main()