-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathExome_Script.py
executable file
·225 lines (207 loc) · 11.9 KB
/
Exome_Script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
#program description:
#we substring out the exome from the genome, and call varsimlab on the exome.
#We then go through the SINC error file and correct the positions listed--
# The error positions will be relative to the exome. We want them relative to the genome.
# genome_position_of_error = exon_position_in_genome + error_position_in_exon
# first we go through the error list and figure out which exon the error belongs to. We then calculate the genome_position_of_error as above.
import sys
if sys.version_info[0] < 3:
raise Exception("Must be using Python 3")
import argparse
import os
import subprocess
from itertools import accumulate
import bisect
import glob
#all of these imports are part of the python standard library of python3
#required arguments
parser=argparse.ArgumentParser()
parser.add_argument("filename", help="name of output file")
parser.add_argument("genome", help="genome to be processessed")
group=parser.add_mutually_exclusive_group(required=True)
group.add_argument("-use_genome", help="generate tumor and normal for entire provided sequence", action="store_true", default=False)
group.add_argument("-bed", help="generate tumor and normal based on bed file containing exonic regions")
#read generation modifications
read_gen=parser.add_argument_group("read generation parameters", "arguments to adjust read generation")
read_gen.add_argument("-c", help="read depth of coverage", default=25, type=float)
read_gen.add_argument("-s", help="use single end reads (default paired)", action="store_true", default=False)
read_gen.add_argument("-l", help="read length. default 100 bp", type=int, default=100)
read_gen.add_argument("-sam", help="align reads to reference genome", action="store_true", default=False)
read_gen.add_argument("-m", help="maximum distance for two bed ranges to be merged into one range. If zero, merges only those ranges that directly overlap with each other", type=int, default=30)
#error generation modifications
#TODO fix these helps. too wordy and unclear:
error_gen=parser.add_argument_group("error parameters", "arguments to adjust tumor error generation")
error_gen.add_argument("-cnv", help="percent of total input to be incorporated into a CNV. Values from 0 to 100. 4 would signify 4 percent of input should be included in CNVs", type=float,default=8)
error_gen.add_argument("-cnv_min_size", help="minimum size of CNVs", default=10000, type=int)
error_gen.add_argument("-cnv_max_size", help="CNV_max_size", default=40000, type=int)
error_gen.add_argument("-snp", help="percent of total input to be turned into SNPs. Values from 0 to 100. A value of 5 indicates 5 percent of genome should be turned into SNPs", type=float, default=.002)
error_gen.add_argument("-indel", help="percent of total input to be included in INDELS. values from 0 to 100, a value of 1 indicates 1 percent of the genome should be included in indels",default=.001, type=float)
error_gen.add_argument("-ploidy", help="tumor ploidy. default diploid", type=int, default=2, choices=[1,2,3])
error_gen.add_argument("-subclones", help="generate multiple tumor subclones, to simulate tumor heterogeneity", type=int, default=1)
args=parser.parse_args()
#tumor_coverage=args.c/args.ploidy
#this line would make the read coverage between tumor and normal equal. As is if all the alleles of the tumor are combined, the tumor coverage is ploidy times more than normal.
assert args.snp>=0 and args.snp<=100, "SNP rate should be between 0 and 100 percent"
assert args.cnv>=0 and args.cnv<=100, "CNV rate should be between 0 and 100 percent"
assert args.indel>=0 and args.indel<=100, "INDEL rate should be between 0 and 100 percent"
#ensure the user gives acceptable values for cnv, indel and snp rate. we could accomplish this with the choices arg in add_argument, but I think it makes the help page look ugly
art_args=["./art_run.sh", args.filename, "exome_with_linebreaks.fa", args.c, args.s, args.snp, args.indel, args.cnv, args.cnv_min_size, args.cnv_max_size, args.l, args.ploidy, args.subclones, args.genome, args.sam]
art_args=list(map(str, art_args))
#list of command line arguments used by art_run.sh.
def prep_bed():
'''take bed file, merge together ranges that overlap or are within args.m of eachother. This prevents the same region being included twice, and prevents short pseudodeletions being created between 2 nearby exons'''
# os.system("module load bedtools")
# bedtools_location=subprocess.check_output(["which bedtools"], shell=True, universal_newlines=True).strip()
os.system("sort -k2,2n {} -o {}".format(args.bed, args.bed))
#sort the bed file based on the start column (required for bedtools merge)
new_bedfile=args.bed+"_disjoint"
os.system("{} merge -d {} -i {} > {}".format("./bedtools2/bin/bedtools", args.m, args.bed, new_bedfile))
#merge together adjacent ranges in bed file, rendering the bed file nonoverlapping
def genome_IO(genome_file):
'''We take all the non-header lines out of the genome and contatenate
them together into a string which we return'''
genome=[]
with open(genome_file, "r") as f:
for line in f:
if line[0]!= ">":
#if the line is not a header we add it to our genome string
genome.append(line.strip())
#We take out linebreaks (they'll mess with our math). We'll have to add them back in later
return ''.join(genome)
def genome_to_exome(genome):
'''Given genome string, we substring out the exonic regions of the genome using the bed file. Returns a list with two elements: a list of exon sequences, and a list of genome offsets (position of exon in genome)'''
exon_sequences=[]
offsets=[]
exon_ranges=[]
fh= open(args.bed, "r")
uniqueexons=list(set(fh.readlines()))
#boucing it through a set keeps only unique lines
for line in uniqueexons:
line=line.strip().split()
if line:
start=int(line[1])
end=int(line[2])
exon_ranges.append((start,end))
exon_ranges.sort(key=lambda x: x[0])
#list of (start,end) tuples, sorted by start position
for start,end in exon_ranges:
offsets.append(start)
exon=genome[start:end]
exon_sequences.append(exon)
#substring out exonic regions
exome=[exon_sequences, offsets]
#we could do this as a dict, but our list of lists will speed things up in a few steps
fh.close()
return exome
def exome_chunk_index(exome):
'''we need the position of each exon in the exome. That way we can decide which exon and given error falls into, and what genome offset to use for that error. given the list generated by genome_to_exome this function adds the list of exome positions and returns the three lists'''
exomelist=exome[0]
genome_offsets=exome[1]
lengths=list(map(len, exomelist))
#a list of the lengths of all the exon chunks
exon_ends=list(accumulate(lengths))
#the end position of each exon. Found by taking the cumulative sum of the lengths of the exons
exon_starts = [(end-length)+1 for (end,length) in zip(exon_ends,lengths)]
#the start position of each exon (starting at one)
exome_with_offsets=[exomelist, genome_offsets, exon_starts]
#we've added a list with the start position of each exon in the exome
return exome_with_offsets
def correct_error_file_faster(error_column, file, exome):
'''Given an error file, the index of that error files uncorrected positions column, and the exome_with_offsets generated above, this function goes through the file and corrected the error_column to be relative to the genome, rather than the exome.'''
uncorrected_error_positions=[]
#error positions relative to exome
exon_number_of_errors=[]
#will contain index of exon that each error belongs to. For example [2,4] means the first error belongs to the 2nd exon, and the 2nd error belongs to the fourth exon
corrected_error_positions=[]
fh=open(file+"corrected", "w")
if "CNV_results" in file:
error_file=open(file, "r").readlines()[1:]
#ignore the header line of the CNV file.
else:
error_file=open(file,"r").readlines()
for line in error_file:
line=line.split()
error_position=int(line[error_column])
uncorrected_error_positions.append(error_position)
genome_offsets=exome[1]
start_positions =exome[2]
for error in uncorrected_error_positions:
exon_number_of_errors.append(bisect.bisect(start_positions, error)-1)
#determine which exon the particular error falls into, using our list of exon start positions and a binary search algorithm
for index, exon_index in enumerate(exon_number_of_errors):
#iterating through the exon number of each error.
start_position=start_positions[exon_index]
#start position of exon in genome
genome_offset=genome_offsets[exon_index]
#determine genome offset for each error, based on which exon it falls in
error_position=uncorrected_error_positions[index]
exome_position=error_position-start_position
#the position of the error within the exon.
corrected=genome_offset+exome_position+1
corrected_error_positions.append(str(corrected))
for position,line in enumerate(error_file):
fh.write(corrected_error_positions[position]+"\t"+line)
fh.close()
def add_header(filename):
'''add a header column to the corrected file names, to make them more understandable.'''
fh=open(filename, "r+")
content=fh.read()
fh.seek(0,0)
if "INDEL" in filename:
#TODO fix
fh.write("genome_position\ttarget\texome_start\texome_end\toriginal\tvariant\n"+content)
if "SNP" in filename:
fh.write("genome_position\ttarget\texome_position\toriginal\tvariant\n"+content)
if "CNV_results" in filename:
fh.write("genome_start_position\ttarget\texome_start_position\ttype\tseq_size\tCNV_size\tallele_1_copies\tallele_2_copies\tNpercent_allele_1\tNpercent_allele_2\n"+content)
if "stdresults" in filename:
fh.write("genome_start\tgenome_end\ttarget\texome_start\texome_end\ttype\n"+content)
fh.close()
def call_varsimlab(genome_file, bed_file):
'''we call varsimlab on our exonic regions.'''
genome=genome_IO(genome_file)
exome_list=genome_to_exome(genome)
exome=exome_list[0] #make a list of the exonic regions
exome=''.join(exome)
exome_with_linebreaks=[]
for i in range(0, len(exome), 50):
exome_with_linebreaks.append(exome[i:i+50])
exome_with_linebreaks='\n'.join(exome_with_linebreaks)
#add in line breaks and a header. art needs them.
exome_with_linebreaks=(">exome\n"+exome_with_linebreaks)
exome_file=open("exome_with_linebreaks.fa", "w")
exome_file.write(exome_with_linebreaks)
exome_file.close()
#exome with linebreaks used by run.sh
subprocess.run(art_args)
if __name__=="__main__" and not args.use_genome:
#if we are doing exome sequencing
genome=genome_IO(args.genome)
prep_bed()
args.bed=args.bed+"_disjoint"
exomedict=genome_to_exome(genome)
b=exome_chunk_index(exomedict)
call_varsimlab(args.genome, args.bed)
for subclone in range(1, (args.subclones+1)):
filelist=glob.glob("{}/tumor/subclone_{}/*.txt".format(args.filename, subclone))
for file in filelist:
if not "stdresults" in file:
correct_error_file_faster(1, file, b)
else:
correct_error_file_faster(2, file, b)
correct_error_file_faster(2, file+"corrected", b)
# correct the start and end of the CNVS in stdresults file
os.system("rm {}/tumor/subclone_{}/*.txt".format(args.filename, subclone))
os.system("rm {}/tumor/subclone_{}/CNV_stdresults.txtcorrected".format(args.filename, subclone))
os.system("mv {}/tumor/subclone_{}/CNV_stdresults.txtcorrectedcorrected {}/tumor/subclone_{}/CNV_stdresultscorrected".format(args.filename, subclone, args.filename, subclone))
if args.ploidy==3:
os.system("mv {}/tumor/subclone_{}/*.txtcorrectedcorrected {}/tumor/subclone_{}/CNV_stdresults_2corrected".format(args.filename, subclone, args.filename, subclone))
#clean up
filelist=glob.glob("{}/tumor/subclone_{}/*".format(args.filename, subclone))
for file in filelist:
add_header(file)
elif __name__=="__main__":
art_args[2]=args.genome
#switch exome_with_linebreaks out for the file path of the genome
subprocess.run(art_args)
#if we're just doing genome sequencing we can simply call run.sh. We don't need to do any error file correcting, or subsequencing.