-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdetect-N-repeats.py
67 lines (57 loc) · 2.05 KB
/
detect-N-repeats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# copyright (c) 2022-2023, Karim Hussein
# python script to find the most frequent repeat of any length in a fasta file
import os
# Ask for the FASTA file and load all of its lines into `lines`.
file_path = input('Enter the path for a fasta file: ')
# isfile() rather than exists(): a directory path "exists" but cannot be
# opened for reading, which would otherwise crash inside open().
if not os.path.isfile(file_path):
    print('The file does not exist')
    # SystemExit instead of exit(): exit() is an interactive helper injected
    # by the site module and is not guaranteed to be available in programs.
    # A non-zero status signals the failure to the calling shell.
    raise SystemExit(1)
print("opening the file")
# utf-8-sig transparently drops a leading byte-order mark if one is present.
with open(file_path, 'r', encoding='utf-8-sig') as f:
    lines = f.readlines()
# Build `seqs`: record name -> full sequence string.
# The record name is the first whitespace-delimited token of its '>' header
# line (the '>' itself is kept as part of the name).
seq_chunks = {}
name = None
for line in lines:
    line = line.strip()
    if line.startswith('>'):
        name = line.split()[0]
        # A repeated header name restarts that record, as before.
        seq_chunks[name] = []
    elif name is not None:
        # Lines that appear before the first header belong to no record and
        # are skipped; previously they raised NameError on `name`.
        seq_chunks[name].append(line)
# Join once per record instead of growing a string line by line.
seqs = {record: ''.join(chunks) for record, chunks in seq_chunks.items()}
def n_repeats(seq, num):
    """
    Count every substring of length ``num`` that occurs in ``seq``.

    A window of ``num`` characters is moved over ``seq`` one position at a
    time, so overlapping occurrences are counted separately.

    Args:
        seq: The sequence string to scan.
        num: The window (repeat) length; must be at least 1.

    Returns:
        A dictionary with each substring and its number of occurrences as
        key:value pairs. It is empty when ``seq`` is shorter than ``num``.

    Raises:
        ValueError: If ``num`` is less than 1.
    """
    if num < 1:
        raise ValueError("num must be a positive integer")
    repeatsdict = {}
    # "+ 1" so the last window, the one ending on the final character of
    # seq, is counted too.
    for i in range(len(seq) - num + 1):
        rep = seq[i:i + num]
        repeatsdict[rep] = repeatsdict.get(rep, 0) + 1
    return repeatsdict
# calculates repeats of length n
# merge all the sequences together
# NOTE(review): once the records are concatenated, a window can span the
# junction between two records -- confirm that this is intended.
all_seqs = "".join(seqs.values())
# taking user input for the repeat's length
Repeat_len = input("Please enter the length of the repeat: ")
try:
    repeat_size = int(Repeat_len)
except ValueError:
    print("The repeat length must be a whole number")
    raise SystemExit(1) from None
if repeat_size < 1:
    print("The repeat length must be at least 1")
    raise SystemExit(1)
# calling n_repeats function
repeats = n_repeats(all_seqs, repeat_size)
# the repeats and how many times they exist
output = ("all the repeats of length ", str(Repeat_len), " and their occurrences", "\n", str(repeats))
if repeats:
    # Invert to count -> repeat so max() picks the highest count. When several
    # repeats share a count, the one seen last in `repeats` is kept.
    rev_dict = {count: rep for rep, count in repeats.items()}
    most_frequent = str(max(rev_dict.items()))
else:
    # No window fits (the merged sequence is shorter than the requested
    # length); max() on an empty dict would raise ValueError.
    rev_dict = {}
    most_frequent = "no repeats of this length were found"
# the most frequent sequence for the specified length, as a (count, repeat) pair
output2 = ("\n", "the most occurring repeat of length ", str(Repeat_len), " is", "\n", most_frequent)
# Everything is computed before the file is opened, and the with-block closes
# it even if a write fails, so no half-written file is left open.
with open("repeats.txt", "w", encoding="utf-8") as f:
    f.writelines(output)
    f.writelines(output2)