forked from dib-lab/khmer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcount-shared-kmers-btw-assemblies.py
76 lines (61 loc) · 2 KB
/
count-shared-kmers-btw-assemblies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import sys
import khmer
import screed
import os
from screed.fasta import fasta_iter
K=32
HASHTABLE_SIZE = 5e7
N_HT = 4
LENGTH_THRESHOLD = int(sys.argv[3])
def iterseq(x):
for record in x:
yield record.sequence
def slidingWindow(sequence, K_size):
total_windows = (len(sequence) - K_size) + 1
for i in range(0, total_windows, 1):
kmer = sequence[i:i + K_size]
if 'N' in kmer:
continue
yield sequence[i:i+K_size]
def count(contigs1, contigs2):
ht = khmer.new_counting_hash(K, HASHTABLE_SIZE, N_HT)
count = 0
count2 = 0
count3 = 0
for n in contigs1:
if len(n) >= LENGTH_THRESHOLD:
kmer1 = slidingWindow(n, K)
for x in kmer1:
count += 1
if ht.get(x):
continue
ht.consume(x)
for n in contigs2:
if len(n) >= LENGTH_THRESHOLD:
kmer2 = slidingWindow(n, K)
for x in kmer2:
count2 += 1
if ht.get(x) > 0:
count3 += 1
# 'count' is the total number of kmers in the first file
# 'count2' is the total number of kmers in the second file
# 'count3' is the total number of kmers shared between the two files.
print count, count2, count3, "%.1f%%" % (count3 / float(count) * 100.)
def main(contig1, contig2):
ht = count(iterseq(screed.open(contig1)),
iterseq(screed.open(contig2)))
def output(ht, contigs, fp):
for n, sequence in enumerate(contigs):
count_list = []
contig_kmers = slidingWindow(sequence, K)
for contig_kmer in contig_kmers:
count_kmer = int(ht.get(contig_kmer)) - 1
count_list.append(count_kmer)
for item in count_list:
print >>fp, '%s' % item
print 'Hashtable occupancy =', ht.n_occupied()/float(HASHTABLE_SIZE)
return count_list
if __name__=='__main__':
contig_file1 = sys.argv[1]
contig_file2 = sys.argv[2]
main(contig_file1, contig_file2)