forked from mrwaas/funny_CRISPR_pun
-
Notifications
You must be signed in to change notification settings - Fork 0
/
catPiss_v.1_0.py
executable file
·305 lines (282 loc) · 13.2 KB
/
catPiss_v.1_0.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
#!/usr/bin/env python3
import argparse
import os, sys, re, glob, shutil
import subprocess
import gzip
from gff3_to_TSSbed import gff3_to_TSSbed
from genelist_to_TSS_bed import TSSs_of_interest_bed
from overlap import range_overlap
from grna_gen import denovoGuideRnaAnno
##########-Start-###################
# Function: Download Gff3 file from Ensembl .gff3 ftp site
def downloadGff3(speciesName,outputDir):
fileList = 'wget ' + '\'' + 'ftp://ftp.ensembl.org/pub/release-98/gff3/' + speciesName + '/CHECKSUMS' + '\''
fileList_run = subprocess.run(fileList, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if fileList_run.returncode != 0:
print("FAILED! Unable to locate list file. Check for existence on the Ensembl ftp site.")
exit(2)
else:
with open('CHECKSUMS') as gZList:
for lines in gZList:
lines = lines.rstrip()
if lines.endswith('.98.gff3.gz'):
temp = lines.split()
fileGrab = temp[2]
siteCmd = 'wget ' + '\'' + 'ftp://ftp.ensembl.org/pub/release-98/gff3/' + speciesName + '/' + fileGrab + '\''
siteCmd_run = subprocess.run(siteCmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if siteCmd_run.returncode != 0:
print("FAILED! Either your species is not available or the species did not match. Please check the Ensembl FTP site and try again")
exit(2)
else:
contents=os.listdir(outputDir)
for files in contents:
if files.endswith('98.gff3.gz'):
preGz = files # the file name that matches the desired gff3 file
with gzip.open(preGz,'rb') as f_in:
with open(preGz[:-3], 'wb') as f_out:
shutil.copyfileobj(f_in, f_out) # unzipping the gff3 file
newName = preGz[:-3] # removing the '.gz' from the name since it is now unzipped
os.remove(preGz) # deleting the .gz file
os.remove('CHECKSUMS') # deleting the CHECKSUMS file
gff3_to_TSSbed(newName) # this is generating a new .bed file of TSSs for the species -- saved so it can be used again
print('\n\n{} was successfully downloaded.\n{} was successfully generated.\n\n'.format(newName, newName[:-3]+'.TSS.bed'))
##########-End-###################
#########-Start-#################
# Function: Download Fasta file from Ensembl .fa ftp site
def downloadFasta(speciesName):
fileList = 'wget ' + '\'' + 'ftp://ftp.ensembl.org/pub/release-98/fasta/' + speciesName + '/dna/CHECKSUMS' + '\''
fileList_run = subprocess.run(fileList, shell = True, stdout = subprocess.PIPE, stderr=subprocess.PIPE)
if fileList_run.returncode != 0:
print("FAILED! Unable to locate list file. Check for existence on the Ensembl ftp site.")
exit(2)
else:
with open('CHECKSUMS') as fastaList:
for lines in fastaList:
lines = lines.rstrip()
if lines.endswith('dna_rm.toplevel.fa.gz'):
temp = lines.split()
fileGrab = temp[2]
siteCmd = 'wget ' + '\'' + 'ftp://ftp.ensembl.org/pub/release-98/fasta/' + speciesName + '/dna/' + fileGrab + '\''
siteCmd_run = subprocess.run(siteCmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if siteCmd_run.returncode != 0:
print("FAILED! Either your species is not available or the species did not match. Please check the Ensembl FTP site and try again")
exit(2)
else:
contents=os.listdir(currPath)
for files in content:
if files.endswith == 'dna_rm.toplevel.fa.gz':
fastaGz = files
with gzip.open(fastaGz,'rb') as f_in:
with open(fastaGz[:-3],'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
newName = fastaGz[:-3]
os.remove(fastaGz)
os.remove('CHECKSUMS')
print('{} was successfully downloaded.'.format(newName,newPath))
##########-End-###################
########-Start-####################
# Function: Generate user inputs for gRNA generation
def userInputs():
print('/n/nWhat is your GC content lower bound (decimal form)?')
gc_lower = input()
if gc_lower > 1:
print('Your GC lower bound needs to be a decimal!')
exit(2)
print('What is your GC content upper bound (decimal form)?')
gc_upper = input()
if gc_upper > 1 or gc_upper < gc_lower:
print('Your GC upper bound needs to be a decimal and greater than or equal to your lower bound!')
exit(2)
print('What is the minimum chromosome range for your search (integer position)?')
minChr = input()
print('What is the maximum chromosome range for your search (integer position)?')
maxChr = input()
print('What score cut-off would you like to use (range: 0-100)?')
score = input()
inputList = [gc_lower, gc_upper, minChr, maxChr, score]
return inputList
###########-end-####################
def main():
parser = argparse.ArgumentParser(description="""
This program identifies a list of guide RNAs for CRISPR-based gene attenuation or promotion\n
for a given list of genes and associated species. The two required inputs are a gene or a \n
list of genes and the species of interest (see pattern requirements below). In addition, the \n
user must also indicate either deNovo or use of a reference table. The output is a\n
.bed file that contains a list of gRNAs for each gene and the associated primers for those\n
gRNAs.
""")
parser.add_argument('geneNames', help='The specific gene(s) of interest entered as a string or the name of the file with the list of genes.')
parser.add_argument('speciesName', help='The name of the target species. Uses underscore for spaces (e.g., homo_sapiens)')
parser.add_argument('-o', '--output', help='Optional: supply output file directory name, otherwise write to program default', dest = 'out')
parser.add_argument('-u', '--upstream', help='Optional: supply distance upstream of the TSS site to check for gRNAs', dest = 'upStream')
parser.add_argument('-d', '--downstream', help='Optional: supply distance downstream of the TSS site to check for gRNAs', dest = 'downStream')
parser.add_argument('-s', '--sortby', help="Optional: if 'de novo' is 'True' sort by is an option for sorting by score using either 'Doench2016_perc', 'Doench2016_score', 'Moreno_Matos_perc', 'Moreno_Matos_score', 'MIT_specificity'", dest = 'sortBy')
parser.add_argument('-n', '--deNovo', help="Choice of using a de novo table from this program ('True')", dest = 'deNovoVar')
parser.add_argument('-f', '--refTable', help="User provided reference table of scored gRNAs.", dest = 'refTableFile')
parser.add_argument('-t', '--casType', choices = ['Cas9','Cas12A'], help="Specify type of cas: Cas9 or Cas12A. This must be entered if deNovo = 'True'", dest = 'casTypeData')
parser.add_argument('-c', '--casFile', help="Identify file name with cas protein information detailed. This should be in the working directory. This must be entered if deNovo = 'True'", dest = 'casFileData')
parser.add_argument('-p', '--cloning', help="Identify the cloning strategy for generation of primers. Default is pX330. Other options for cloning strategies are in 'cloningstrategy_primers.txt, which MUST be in your working directory.", dest = 'cloneDest')
args = parser.parse_args()
geneNames = args.geneNames
speciesName = args.speciesName
speciesName = speciesName.lower()
if args.deNovoVar:
if args.casTypeData:
casType = args.casTypeData
else:
print("If deNovo is desired, you must define the '-t' option!")
exit(1)
if args.casFileData:
casFile = args.casFileData
else:
print("If deNovo is desired, you must define the '-c' option!")
exit(1)
else:
if args.refTableFile:
refTable = args.refTableFile
else:
print('You must provide a reference table!')
exit(1)
if args.deNovoVar == False and args.refTableFile == False:
print('You must input either a request for a deNovo analysis or use a reference table!')
exit(1)
if args.cloneDest:
cloningStrat = args.cloneDest
currPath = os.getcwd() # current path
sourceDir = currPath + '/' + speciesName # directory that contains the sources files for the species
if args.deNovoVar:
casFile = currPath + '/' + casFile
if os.path.exists(sourceDir) == 0:
os.mkdir(sourceDir) # if source directory for species not present, this creates it
contents = os.listdir(sourceDir)
TSS = False
FASTA = False
GRNA = False
for files in contents:
files = files.rstrip()
if files.endswith('.TSS.bed'):
TSS = True
nameTss = files
if files.endswith( ('.fa','.fasta') ):
FASTA = True
nameFasta = files
if files == 'selectGrnas.bed':
GRNA = True
nameGrna = files
print(TSS, FASTA, GRNA)
if (TSS == False) and (FASTA == False) and (GRNA == False):
print("""\n\nYou are missing all of the necessary files.\n
It will take quite a bit of time (i.e., > 1 day
to download them and generate the necessary files.\n\n
Would you like to proceed (yes/no)?""")
userIn1 = input()
if userIn1 == 'yes':
os.chdir(sourceDir)
print('OK. Sit back. Relax. This is going to take a while!')
print('A TSS reference file is not present. Retrieving necessary files from the Ensembl ftp download site.')
downloadGff3(speciesName)
print('A fasta reference file is not present. Retrieving necessary files from the Ensemble ftp download site.')
downloadFasta(speciesName)
print('Generating reference guide RNA .bed file.')
userIn = userInputs()
denovoGuideRnaAnno(nameFasta, casType, casFile, userIn[0], userIn[1], userIn[2], userIn[3], userIn[4] )
else:
print('Ok. Come back perhaps when you have more time.')
exit(2)
elif (TSS == True) and (FASTA == False) and (GRNA == False):
print("""\n\nYou are missing some really large files.\n
It will take quite a bit of time (i.e., > 1 day
to download them and generate the necessary files.\n\n
Would you like to proceed (yes/no)?""")
userIn1 = input()
if userIn1 == 'yes':
os.chdir(sourceDir)
print('OK. Sit back. Relax. This is going to take a while!')
print('A fasta reference file is not present. Retrieving necessary files from the Ensemble ftp download site.')
downloadFasta(speciesName)
print('Generating reference guide RNA .bed file.')
userIn = userInputs()
denovoGuideRnaAnno(nameFasta, casType, casFile, userIn[0], userIn[1], userIn[2], userIn[3], userIn[4] )
else:
print('Ok. Come back perhaps when you have more time.')
exit(2)
elif (TSS == True) and (FASTA == True) and (GRNA == False):
print("""\n\nYou are missing a really big file.\n
It will take quite a bit of time (i.e., > 1 day
to download them and generate the necessary files.\n\n
Would you like to proceed (yes/no)?""")
userIn1 = input()
if userIn1 == 'yes':
os.chdir(sourceDir)
print('OK. Sit back. Relax. This is going to take a while!')
print('Generating reference guide RNA .bed file.')
userIn = userInputs()
denovoGuideRnaAnno(nameFasta, casType, casFile, userIn[0], userIn[1], userIn[2], userIn[3], userIn[4] )
else:
print('Ok. Come back perhaps when you have more time.')
exit(2)
elif (TSS == False) and (FASTA == True) and (GRNA == False):
print("""\n\nYou are missing a large set of the necessary files.\n
It will take quite a bit of time (i.e., > 1 day
to download them and generate the necessary files.\n\n
Would you like to proceed (yes/no)?""")
userIn1 = input()
if userIn1 == 'yes':
os.chdir(sourceDir)
print('OK. Sit back. Relax. This is going to take a while!')
print('A TSS reference file is not present. Retrieving necessary files from the Ensembl ftp download site.')
downloadGff3(speciesName)
print('Generating reference guide RNA .bed file.')
userIn = userInputs()
denovoGuideRnaAnno(nameFasta, casType, casFile, userIn[0], userIn[1], userIn[2], userIn[3], userIn[4] )
else:
print('Ok. Come back perhaps when you have more time.')
exit(2)
elif (TSS == False) and (FASTA == True) and (GRNA == True):
print("""You are missing a relatively small file.
It will take just a little bit of time (i.e.,
(i.e., a couple of minutes) to download and
generate the necessary file.\n
Would you like to proceed (yes/no)?""")
userIn1 = input()
if userIn1 == 'yes':
os.chdir(sourceDir)
print('A TSS reference file is not present. Retrieving necessary files from the Ensembl ftp download site.')
downloadGff3(speciesName,sourceDir)
else:
print('Ok. Come back perhaps when you have more time.')
exit(2)
elif (TSS == True) and (GRNA == True):
print('The necessary files are here.')
if args.out:
outputDir = currPath + '/' + outputFileDirName
else:
outputDir = currPath + '/' + speciesName + '_OutputFiles'
os.chdir(currPath)
os.mkdir(outputDir)
contents=os.listdir(sourceDir)
for files in contents:
if files.endswith('.TSS.bed'):
inLocFile = sourceDir+'/' + files
geneNamesIn = currPath + '/' + geneNames
TSSs_of_interest_bed(inLocFile, geneNamesIn) # this is generating a new .bed file of TSSs specific to the desired genes
overlapInput = files[:-3]+'genesofinterest.bed'
gRNAs = currPath + '/' + speciesName + '/selectGrnas.bed'
if args.upStream:
upStrIn = upStr
else:
upStrIn = 100
if args.downStream:
downStrIn = downStr
else:
downStrIn = 50
if args.sortBy:
sortByIn = args.sortBy
else:
sortByIn = 'Doench2016'
temp = os.getcwd()
print(temp)
range_overlap(overlapInput, gRNAs, outputDir +'/overLapGRnas_OUTPUT', upStrIn, downStrIn, sortByIn, args.deNovoVar, cloningStrat)
sys.exit(0)
if __name__ == '__main__':
main()