forked from dib-lab/khmer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path find-knots.py
executable file
·109 lines (84 loc) · 3.73 KB
/
find-knots.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#! /usr/bin/env python
"""
Find highly-connected k-mers and output them in a .stoptags file, for use
in partitioning.
% python scripts/find-knots.py <base>
"""
import sys
import argparse
import glob
import os
import khmer
# --- Counting-hash parameters -------------------------------------------
DEFAULT_COUNTING_HT_SIZE = 3e6          # number of bytes per table
DEFAULT_COUNTING_HT_N = 4               # number of counting hash tables

# --- Lump-removal parameters --------------------------------------------
# Probably shouldn't be changed, but who knows?
#
# How it works: we walk EXCURSION_DISTANCE out from each tag; if more than
# EXCURSION_KMER_THRESHOLD k-mers fall within that range, the excursion is
# "big" and every visited k-mer is tracked.  Any k-mer seen more than
# EXCURSION_KMER_COUNT_THRESHOLD times is marked BAD and becomes a stop
# tag for traversal.

# don't change these!
EXCURSION_DISTANCE = 40
EXCURSION_KMER_THRESHOLD = 200
EXCURSION_KMER_COUNT_THRESHOLD = 2
# EXCURSION_KMER_COUNT_THRESHOLD = 5  # -- works ok for non-diginormed data
###
def main():
    """Find highly-connected k-mers and save them as stop tags.

    Loads <graphbase>.ht and <graphbase>.tagset (plus <graphbase>.stoptags
    if one already exists), then processes every <graphbase>.subset.*.pmap
    file: each subset is repartitioned against a counting hash to locate
    highly-connected regions, merged, and repartitioned again.  The
    accumulated stop tags are written to <graphbase>.stoptags after each
    file, and processed pmap files are renamed to *.processed.

    Fixes vs. the original:
      * exits with a clear error when no pmap files match, instead of an
        IndexError on pmap_files[0];
      * progress counter reports 1-based "(i of N)" instead of "(0 of N)";
      * print statements rewritten as single-argument print() calls so the
        file parses under both Python 2 and Python 3.
    """
    parser = argparse.ArgumentParser(
        description="Find all highly connected k-mers.")
    parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes',
                        default=DEFAULT_COUNTING_HT_N,
                        help='number of counting hash tables to use')
    parser.add_argument('--hashsize', '-x', type=float, dest='min_hashsize',
                        default=DEFAULT_COUNTING_HT_SIZE,
                        help='lower bound on counting hashsize to use')
    parser.add_argument('graphbase')
    args = parser.parse_args()

    graphbase = args.graphbase

    print('loading ht %s.ht' % graphbase)
    ht = khmer.load_hashbits(graphbase + '.ht')

    # tags (graph waypoints) saved by the earlier partitioning step
    print('loading tagset %s.tagset...' % graphbase)
    ht.load_tagset(graphbase + '.tagset')

    initial_stoptags = False    # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print('loading stoptags %s.stoptags' % graphbase)
        ht.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')
    if not pmap_files:
        # fail early with a useful message rather than IndexError below
        sys.exit('ERROR: no files found matching %s.subset.*.pmap' %
                 graphbase)

    print('loading %d pmap files (first one: %s)' % (len(pmap_files),
                                                     pmap_files[0]))
    print('---')
    print('output stoptags will be in %s.stoptags' % graphbase)
    if initial_stoptags:
        print('(these output stoptags will include the already-loaded set)')
    print('---')

    # create counting hash sized from the command-line arguments
    K = ht.ksize()
    counting = khmer.new_counting_hash(K, args.min_hashsize, args.n_hashes)

    # load & merge each subset, finding knots as we go
    for n, subset_file in enumerate(pmap_files):
        print('<- %s' % subset_file)
        subset = ht.load_subset_partitionmap(subset_file)

        print('** repartitioning subset... %s' % subset_file)
        ht.repartition_largest_partition(subset, counting,
                                         EXCURSION_DISTANCE,
                                         EXCURSION_KMER_THRESHOLD,
                                         EXCURSION_KMER_COUNT_THRESHOLD)

        print('** merging subset... %s' % subset_file)
        ht.merge_subset(subset)

        print('** repartitioning, round 2... %s' % subset_file)
        size = ht.repartition_largest_partition(
            None, counting,
            EXCURSION_DISTANCE,
            EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print('** repartitioned size: %s' % size)

        # checkpoint the stop tags after each subset so progress survives
        # an interruption
        print('saving stoptags binary')
        ht.save_stop_tags(graphbase + '.stoptags')
        os.rename(subset_file, subset_file + '.processed')
        print('(%d of %d)\n' % (n + 1, len(pmap_files)))

    print('done!')
# script entry point: run only when executed directly, not on import
if __name__ == '__main__':
    main()