singleCopyAlign2paml_workflow.py

# -*- coding:utf-8 -*-
# @FileName :singleCopyAlign2paml_workflow.py
# @Time     :2022/2/9 17:46
# @Author   :Xian

## 这是一个生成paml mcmctree 输入align sequence文件的workflow
## 
## 本脚本等效于依次使用如下脚本：
## extractSingleCopySequence.py --> batchId2Spname.py --> batchSeqAlign.py --> fasAlign2phy.py -->delStopCodon.py

import re
import os
import shutil
from Bio import SeqIO
from Bio import AlignIO
import argparse
import time

def filterId (Orthsco,Orth):
    '''
    Orthsco : Orthfiner file Orthogroups_SingleCopyOrthologues.txt
    Orth : Orthfiner file Orthogroups.txt
    '''
    orthSingleList = []
    with open(Orthsco) as afile:
        for line in afile:
            line = line.strip()
            orthSingleList.append(line)
        # print(orthSingleList)
    orth2single = {}
    with open(Orth) as bfile:
        for line in bfile:
            line = line.strip()
            lineList = re.split('[:]',line)
            orth2single[lineList[0]] = lineList[1]
        # print(orth2single)
    return orthSingleList,orth2single

def id2Fa (allFasta):
    id2seq = {}
    for seq_record in SeqIO.parse(allFasta, "fasta"):
        id2seq[seq_record.id] = seq_record
    return id2seq

def getFile(pathname,suffix):
    '''
    获取指定目录下的某类文件,存入列表中
    pathname: 搜索目录
    suffix: 文件后缀
    '''
    name = []
    for root, dirs, files in os.walk(pathname):
        # root 所指的是当前正在遍历的这个文件夹的本身的地址
        # dirs 是一个list，内容是该文件夹中所有的目录的名字(不包括子目录)
        # files同样是list, 内容是该文件夹中所有的文件(不包括子目录)
        for i in files:
            # print (i)
            if os.path.splitext(i)[1] == suffix:
                name.append(i)
    # print (name)
    return name

def fas2phy(infas):
    with open(infas, 'r') as fin:
        sequences = [(m.group(1), ''.join(m.group(2).split()))
                     for m in re.finditer(r'(?m)^>([^ \n]+)[^\n]*([^>]*)', fin.read())]
        with open(infas+".phy", 'w') as fout:
            fout.write('%d %d\n' % (len(sequences), len(sequences[0][1])))
            for item in sequences:
                fout.write('%-30s %s\n' % item)

def delStopCodon(inphy,outphy):
    with open(inphy,'r') as fin:
        content = fin.read()
        content = content.replace("TAG","---")
        content = content.replace("TAA", "---")
        content = content.replace("TGA", "---")
    with open(outphy,'w') as fout:
        fout.write(content)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description = "用途：利用orthfinder结果，提取并比对单拷贝基因序列。")
    parser.add_argument("oscgroups", type=str,
                        help="Orthfinder resulte file: \"Orthogroups_SingleCopyOrthologues.txt\" (type = str)")
    parser.add_argument("ogroups", type=str, help="Orthfinder resulte file: \"Orthogroups.txt\" (type = str)")
    parser.add_argument("allcds", type=str, help="a merged coding squence file of all used species (type = str)")
    parser.add_argument("cdsdir", type = str, help = "dir of xxx.cds.fa file, xxx is specie's name (type = str)") 
    parser.add_argument("-c","--cdssuffix", type = str, default = "fa", help = "suffix of xxx.cds.fa(type = str)")
    parser.add_argument("-d","--scpdir", type = str, default = "./singleCopySeq", help = "dir of singleCopySeq (type = str)")
    parser.add_argument("-s","--scpsuffix", type = str, default = "fa", help = "suffix of singleCopySeq, It is recommended to use a non-fasta suffix such as 'fa' to avoid errors in subsequent programs (type = str)")
    parser.add_argument("-r","--rscpsuffix", type = str, default = "fasta", help = "suffix of replaced singleCopySeq(type = str)")
    parser.add_argument("-m","--malignsuffix", type = str, default = "fas", help = "multiple sequence alignment file suffixes in fasta format generated by muscle(type = str)")
    parser.add_argument("-p","--pmalignsuffix", type = str, default = "phy", help = "multiple sequence alignment file suffixes in phylip format generated by muscle(type = str)")

    args = parser.parse_args()

    ## Extraction of single copy Gene sequence from Orthogroups.txt
    ## 提取Orthogroups.txt中的单拷贝基因序列
    interval = ">" * 50
    print(f'\n{interval}')
    print('\nExtraction of single copy Gene sequence\n')
    osinglelist, o2single = filterId(args.oscgroups, args.ogroups)
    id2seq = id2Fa(args.allcds)
    if os.path.exists(args.scpdir):
        shutil.rmtree(args.scpdir)
    os.mkdir(args.scpdir)
    for og in osinglelist:
        # print(o2single[og])
        idList = re.split('\s+', o2single[og])[1:]  ### delete blank in head
        seqList = []
        for id in idList:
            # print(og + ' ' + id)
            seqList.append(id2seq[id])
        # SeqIO.write(seqList, "./singleCopySeq/" + og + ".singlecopy.cds.fa", "fasta")
        SeqIO.write(seqList, f'{args.scpdir}/{og}.singlecopy.cds.fa', "fasta")
    print(f'\n{interval}')

    ## Read cds files to establish the index relationship between the sequence and the file name (species name)
    ## 读取CDS文件，建立序列与文件名(物种名)的索引关系
    print(f'\n{interval}')
    print('\nRead cds files to establish the index relationship\n')
    cdsFa = getFile(args.cdsdir,f'.{args.cdssuffix}')
    print(cdsFa)
    id2sp = {}
    for sp in cdsFa:
        print ("input "+ sp +"\n")
        spName = re.split(r"\.",sp)
        for seq_record in SeqIO.parse(args.cdsdir+'/'+sp,"fasta"):
            # print(seq_record)
            id2sp[seq_record.id] = spName[0]
    print(f'\n{interval}')

    ## Read the single copy gene sequence and replace the gene name as the species name.
    ## 读取单拷贝基因序列，并将基因名称替换为物种名称。
    print(f'\n{interval}')
    print('\nRead and replace the name of single copy gene sequence\n')
    name = getFile(args.scpdir,f'.{args.scpsuffix}')
    for id in name:
        print (id)
        seqList = []
        for seq_record in SeqIO.parse(args.scpdir+'/'+id,"fasta"):
            seq_record.id = id2sp[seq_record.id]
            seq_record.description = seq_record.id
            seqList.append(seq_record)
        SeqIO.write(seqList, args.scpdir+'/'+id+"sta", "fasta")
    print(f'\n{interval}')

    ## Read the single-copy gene sequence after name substitution and make multiple sequence alignment
    ## 读取名称替换后的单拷贝基因序列，进行多序列比对
    print(f'\n{interval}')
    print('\nMake multiple sequence alignment of single copy gene sequence\n')
    name = getFile(args.scpdir,f'.{args.rscpsuffix}')
    for id in name:
        print(f'\nReading {args.scpdir}/{id}')        
        os.system(f'muscle -align {args.scpdir}/{id} -output {args.scpdir}/{id}.fas')
        # os.popen(f'muscle -in {args.scpdir}/{id} -out {args.scpdir}/{id}.fas')
        # time.sleep(5)
        # os.popen在循环中是多线程异步执行，如果每个线程执行时间很长会发生相互干扰，但是效率更高。需要合理利用

        # os.popen("muscle -in ./singleCopySeq/"+id+" -fastaout ./singleCopySeq/"+id+".fas")
        # os.popen("cat ./singleCopySeq/*.phy >> ./singleCopySeq/all.phy")
        # shutil.copy("./singleCopySeq/*.phy", "./singleCopySeq/all.phy")
    print(f'\n{interval}')

    ## Convert fasta format of muscle multiple sequence alignment to phylip Sequential format
    ## 将muscle多序列比对的fasta格式转换为 phylip Sequential格式(与-physout输出格式不一致)。
    print(f'\n{interval}')
    print('\nConvert fasta format of muscle multiple sequence alignment to phylip Sequential format\n')
    name = getFile(args.scpdir,f'.{args.malignsuffix}')
    for id in name:
        print ("\nConverting "+id )
        # AlignIO.convert(f'{args.scpdir}/{id}', "fasta", f'{args.scpdir}/{id}.phy',"phylip-sequential") ### biopython转化的phy格式 paml不识别
        fas2phy(f'{args.scpdir}/{id}')
    print(f'\n{interval}')

    ## Merge all single-copy gene multiple sequence alignment files in phylip format
    ## 合并所有phylip格式的单拷贝基因多序列比对文件
    print(f'\n{interval}')
    print('\nMerge all single-copy gene multiple sequence alignment files in phylip format\n')
    time.sleep(3)
    name = getFile(args.scpdir,f'.{args.pmalignsuffix}')
    if os.path.exists(f'{args.scpdir}/all.phys'):
        os.unlink(f'{args.scpdir}/all.phys')
    for id in name:
        with open(f'{args.scpdir}/{id}','r') as fin:
            print (f'\nReading {args.scpdir}/{id}')
            with open(f'{args.scpdir}/all.phys','a') as fout:
                fout.write(fin.read())
                print (f'\nPhy file :{id} was merged!' )
                print ("\nMerged Phy file :"+id )
    print(f'\n{interval}')
    
    ## Delete the stop codon in the alignment sequence
    ## 删除比对序列中的终止密码子
    print(f'\n{interval}')
    print('\nDelete the stop codon in the alignment sequence\n')
    delStopCodon(f'{args.scpdir}/all.phys',f'{args.scpdir}/all.delStopCodon.phys')
    abspath = os.path.abspath(f'{args.scpdir}/all.delStopCodon.phys')
    print (f'\nGenerate final file: {abspath}\n\nThe work is done')