-
Notifications
You must be signed in to change notification settings - Fork 1
/
singleCopyAlign2paml_workflow.py
199 lines (182 loc) · 9.13 KB
/
singleCopyAlign2paml_workflow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# -*- coding:utf-8 -*-
# @FileName :singleCopyAlign2paml_workflow.py
# @Time :2022/2/9 17:46
# @Author :Xian
## 这是一个生成paml mcmctree 输入align sequence文件的workflow
##
## 本脚本等效于依次使用如下脚本:
## extractSingleCopySequence.py --> batchId2Spname.py --> batchSeqAlign.py --> fasAlign2phy.py -->delStopCodon.py
import re
import os
import shutil
from Bio import SeqIO
from Bio import AlignIO
import argparse
import time
def filterId (Orthsco,Orth):
'''
Orthsco : Orthfiner file Orthogroups_SingleCopyOrthologues.txt
Orth : Orthfiner file Orthogroups.txt
'''
orthSingleList = []
with open(Orthsco) as afile:
for line in afile:
line = line.strip()
orthSingleList.append(line)
# print(orthSingleList)
orth2single = {}
with open(Orth) as bfile:
for line in bfile:
line = line.strip()
lineList = re.split('[:]',line)
orth2single[lineList[0]] = lineList[1]
# print(orth2single)
return orthSingleList,orth2single
def id2Fa (allFasta):
id2seq = {}
for seq_record in SeqIO.parse(allFasta, "fasta"):
id2seq[seq_record.id] = seq_record
return id2seq
def getFile(pathname,suffix):
'''
获取指定目录下的某类文件,存入列表中
pathname: 搜索目录
suffix: 文件后缀
'''
name = []
for root, dirs, files in os.walk(pathname):
# root 所指的是当前正在遍历的这个文件夹的本身的地址
# dirs 是一个list,内容是该文件夹中所有的目录的名字(不包括子目录)
# files同样是list, 内容是该文件夹中所有的文件(不包括子目录)
for i in files:
# print (i)
if os.path.splitext(i)[1] == suffix:
name.append(i)
# print (name)
return name
def fas2phy(infas):
with open(infas, 'r') as fin:
sequences = [(m.group(1), ''.join(m.group(2).split()))
for m in re.finditer(r'(?m)^>([^ \n]+)[^\n]*([^>]*)', fin.read())]
with open(infas+".phy", 'w') as fout:
fout.write('%d %d\n' % (len(sequences), len(sequences[0][1])))
for item in sequences:
fout.write('%-30s %s\n' % item)
def delStopCodon(inphy,outphy):
with open(inphy,'r') as fin:
content = fin.read()
content = content.replace("TAG","---")
content = content.replace("TAA", "---")
content = content.replace("TGA", "---")
with open(outphy,'w') as fout:
fout.write(content)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description = "用途:利用orthfinder结果,提取并比对单拷贝基因序列。")
parser.add_argument("oscgroups", type=str,
help="Orthfinder resulte file: \"Orthogroups_SingleCopyOrthologues.txt\" (type = str)")
parser.add_argument("ogroups", type=str, help="Orthfinder resulte file: \"Orthogroups.txt\" (type = str)")
parser.add_argument("allcds", type=str, help="a merged coding squence file of all used species (type = str)")
parser.add_argument("cdsdir", type = str, help = "dir of xxx.cds.fa file, xxx is specie's name (type = str)")
parser.add_argument("-c","--cdssuffix", type = str, default = "fa", help = "suffix of xxx.cds.fa(type = str)")
parser.add_argument("-d","--scpdir", type = str, default = "./singleCopySeq", help = "dir of singleCopySeq (type = str)")
parser.add_argument("-s","--scpsuffix", type = str, default = "fa", help = "suffix of singleCopySeq, It is recommended to use a non-fasta suffix such as 'fa' to avoid errors in subsequent programs (type = str)")
parser.add_argument("-r","--rscpsuffix", type = str, default = "fasta", help = "suffix of replaced singleCopySeq(type = str)")
parser.add_argument("-m","--malignsuffix", type = str, default = "fas", help = "multiple sequence alignment file suffixes in fasta format generated by muscle(type = str)")
parser.add_argument("-p","--pmalignsuffix", type = str, default = "phy", help = "multiple sequence alignment file suffixes in phylip format generated by muscle(type = str)")
args = parser.parse_args()
## Extraction of single copy Gene sequence from Orthogroups.txt
## 提取Orthogroups.txt中的单拷贝基因序列
interval = ">" * 50
print(f'\n{interval}')
print('\nExtraction of single copy Gene sequence\n')
osinglelist, o2single = filterId(args.oscgroups, args.ogroups)
id2seq = id2Fa(args.allcds)
if os.path.exists(args.scpdir):
shutil.rmtree(args.scpdir)
os.mkdir(args.scpdir)
for og in osinglelist:
# print(o2single[og])
idList = re.split('\s+', o2single[og])[1:] ### delete blank in head
seqList = []
for id in idList:
# print(og + ' ' + id)
seqList.append(id2seq[id])
# SeqIO.write(seqList, "./singleCopySeq/" + og + ".singlecopy.cds.fa", "fasta")
SeqIO.write(seqList, f'{args.scpdir}/{og}.singlecopy.cds.fa', "fasta")
print(f'\n{interval}')
## Read cds files to establish the index relationship between the sequence and the file name (species name)
## 读取CDS文件,建立序列与文件名(物种名)的索引关系
print(f'\n{interval}')
print('\nRead cds files to establish the index relationship\n')
cdsFa = getFile(args.cdsdir,f'.{args.cdssuffix}')
print(cdsFa)
id2sp = {}
for sp in cdsFa:
print ("input "+ sp +"\n")
spName = re.split(r"\.",sp)
for seq_record in SeqIO.parse(args.cdsdir+'/'+sp,"fasta"):
# print(seq_record)
id2sp[seq_record.id] = spName[0]
print(f'\n{interval}')
## Read the single copy gene sequence and replace the gene name as the species name.
## 读取单拷贝基因序列,并将基因名称替换为物种名称。
print(f'\n{interval}')
print('\nRead and replace the name of single copy gene sequence\n')
name = getFile(args.scpdir,f'.{args.scpsuffix}')
for id in name:
print (id)
seqList = []
for seq_record in SeqIO.parse(args.scpdir+'/'+id,"fasta"):
seq_record.id = id2sp[seq_record.id]
seq_record.description = seq_record.id
seqList.append(seq_record)
SeqIO.write(seqList, args.scpdir+'/'+id+"sta", "fasta")
print(f'\n{interval}')
## Read the single-copy gene sequence after name substitution and make multiple sequence alignment
## 读取名称替换后的单拷贝基因序列,进行多序列比对
print(f'\n{interval}')
print('\nMake multiple sequence alignment of single copy gene sequence\n')
name = getFile(args.scpdir,f'.{args.rscpsuffix}')
for id in name:
print(f'\nReading {args.scpdir}/{id}')
os.system(f'muscle -align {args.scpdir}/{id} -output {args.scpdir}/{id}.fas')
# os.popen(f'muscle -in {args.scpdir}/{id} -out {args.scpdir}/{id}.fas')
# time.sleep(5)
# os.popen在循环中是多线程异步执行,如果每个线程执行时间很长会发生相互干扰,但是效率更高。需要合理利用
# os.popen("muscle -in ./singleCopySeq/"+id+" -fastaout ./singleCopySeq/"+id+".fas")
# os.popen("cat ./singleCopySeq/*.phy >> ./singleCopySeq/all.phy")
# shutil.copy("./singleCopySeq/*.phy", "./singleCopySeq/all.phy")
print(f'\n{interval}')
## Convert fasta format of muscle multiple sequence alignment to phylip Sequential format
## 将muscle多序列比对的fasta格式转换为 phylip Sequential格式(与-physout输出格式不一致)。
print(f'\n{interval}')
print('\nConvert fasta format of muscle multiple sequence alignment to phylip Sequential format\n')
name = getFile(args.scpdir,f'.{args.malignsuffix}')
for id in name:
print ("\nConverting "+id )
# AlignIO.convert(f'{args.scpdir}/{id}', "fasta", f'{args.scpdir}/{id}.phy',"phylip-sequential") ### biopython转化的phy格式 paml不识别
fas2phy(f'{args.scpdir}/{id}')
print(f'\n{interval}')
## Merge all single-copy gene multiple sequence alignment files in phylip format
## 合并所有phylip格式的单拷贝基因多序列比对文件
print(f'\n{interval}')
print('\nMerge all single-copy gene multiple sequence alignment files in phylip format\n')
time.sleep(3)
name = getFile(args.scpdir,f'.{args.pmalignsuffix}')
if os.path.exists(f'{args.scpdir}/all.phys'):
os.unlink(f'{args.scpdir}/all.phys')
for id in name:
with open(f'{args.scpdir}/{id}','r') as fin:
print (f'\nReading {args.scpdir}/{id}')
with open(f'{args.scpdir}/all.phys','a') as fout:
fout.write(fin.read())
print (f'\nPhy file :{id} was merged!' )
print ("\nMerged Phy file :"+id )
print(f'\n{interval}')
## Delete the stop codon in the alignment sequence
## 删除比对序列中的终止密码子
print(f'\n{interval}')
print('\nDelete the stop codon in the alignment sequence\n')
delStopCodon(f'{args.scpdir}/all.phys',f'{args.scpdir}/all.delStopCodon.phys')
abspath = os.path.abspath(f'{args.scpdir}/all.delStopCodon.phys')
print (f'\nGenerate final file: {abspath}\n\nThe work is done')