forked from zanglab/SICER2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sicer
executable file
·235 lines (192 loc) · 8.35 KB
/
sicer
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
#!/usr/bin/env python3
# Developed by Zang Lab at University of Virginia - 2018
#Author: Jin Yong Yoo
import os
import sys, errno, warnings
curr_path = os.getcwd()
cpu_available = os.cpu_count() - 1 #leave one core for I/O
import subprocess
import argparse
# Imports from SICER package
from sicer.main import run_SICER
from sicer.lib import Utility, GenomeData
def warning_on_one_line(message, category, filename, lineno, file=None, line=None):
return '%s:%s: %s:%s\n' % (filename, lineno, category.__name__, message)
warnings.formatwarning = warning_on_one_line
def main():
'''The main function/pipeline for SICER'''
parser = argparse.ArgumentParser(description='Processing arguments for SICER', usage = "Use --help or -h for more information")
parser.add_argument(
'--treatment_file',
'-t',
required=True,
type=str,
help='''Name of the sample file you wish to run SICER on. This can either be the relative or the absolute path of the file. Must be in BED or BAM format.'''
)
parser.add_argument(
'--control_file',
'-c',
required=False,
type=str,
help='''Name of the control library in BED or BAM format. This can either be the relative or the absolute path of the file. If you wish to run SICER without a control library, simply do not enter the file. '''
)
parser.add_argument(
'--species',
'-s',
required=True,
type=str,
help='The species/genome used (ex: hg38)'
)
parser.add_argument(
'--redundancy_threshold',
'-rt',
required=False,
type=int,
default=1,
help='The number of copies of indentical reads allowed in a library. Default value is 1'
)
parser.add_argument(
'--window_size',
'-w',
required=False,
type=int,
default=200,
help='Resolution of SICER. Default value is 200 (bp)'
)
parser.add_argument(
'--fragment_size',
'-f',
required=False,
type=int,
default=150,
help='The amount of shift from the beginning of a read to the center of the DNA fragment represented by the read. Default value is 150 (bp).'
)
parser.add_argument(
'--effective_genome_fraction',
'-egf',
required=False,
type=float,
default=0.74,
help='Effective genome as fraction of the genome size. Default value is 0.74'
)
parser.add_argument(
'--false_discovery_rate',
'-fdr',
required=False,
default=0.01,
type=float,
help='''Remove all islands with an false_discovery_rate below cutoff. Default value is 0.01.'''
)
parser.add_argument(
'--output_directory',
'-o',
required=False,
default=curr_path,
type=str,
help='Path of the directory in which results will be stored. Default path is the current path'
)
parser.add_argument(
'--gap_size',
'-g',
required=False,
type=int,
default=600,
help='The minimum length of a \"gap\" such that neighboring window is an \"island.\" This value must be a multiple of the window size. Default value is 600 (bp)'''
)
parser.add_argument(
'--e_value',
'-e',
required=False,
type=int,
default=1000,
help='E-value. Requires user input when no control library is provided. Default value is 1000'
)
parser.add_argument(
'--cpu',
'-cpu',
required=False,
type=int,
default=cpu_available,
help='CPU Core Count: The number of CPU cores SICER program will use when executing multi-processing tasks. Optimal core count is the species\' number of chromosomes. Default value is the maximum number of cores avaiable in the system.'
)
parser.add_argument(
'--significant_reads',
required=False,
action='store_true',
help='Output Significant Reads: Enter \"--significant_reads\" to have SICER produce a BED file of treatment reads filtered by significant islands and WIG file of filtered reads binned into windows'
)
parser.add_argument(
"--verbose",
"-v",
required=False,
help="increase console output verbosity",
action="store_true"
)
args = parser.parse_args()
setattr(args,'subcommand','SICER')
setattr(args,'df',False)
# Check if argument inputs are valid
if not(os.path.isabs(args.treatment_file)):
args.treatment_file = os.path.join(curr_path, args.treatment_file)
if (not (Utility.fileExists(args.treatment_file))):
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), args.treatment_file)
if (not (args.treatment_file.lower().endswith('.bed')) and not (args.treatment_file.lower().endswith('.bam'))):
warnings.warn("Treatment file must be in BED or BAM format.")
# Checks if the inputs files are in BAM format. If they are, convert them into BED format using bamToBed
# functionality from bedtools2 (https://github.com/arq5x/bedtools2).
if (args.treatment_file.lower().endswith('.bam')):
bed_file_name = args.treatment_file.replace('.bam', '.bed')
if not Utility.fileExists(bed_file_name):
process = subprocess.Popen("bedtools bamtobed -i %s > %s" % (args.treatment_file, bed_file_name),
stdin=subprocess.PIPE,
shell=True,
)
process.communicate()
if process.returncode != 0:
sys.stderr.write("Error: Cannot convert BAM file to BED file.\nCheck if bedtools2 (https://github.com/arq5x/bedtools2) has been installed correctly.\n")
sys.exit(1)
args.treatment_file = bed_file_name
if (args.control_file is not None):
if not(os.path.isabs(args.control_file)):
args.control_file = os.path.join(curr_path, args.control_file)
if (not (Utility.fileExists(args.control_file))):
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), args.control_file)
if (not (args.control_file.lower().endswith('.bed')) and not (args.control_file.lower().endswith('.bam'))):
warnings.warn("Treatment file must be in BED or BAM format.")
if (args.control_file.lower().endswith('.bam')):
bed_file_name = args.control_file.replace('.bam', '.bed')
if not Utility.fileExists(bed_file_name):
process = subprocess.Popen("bedtools bamtobed -i %s > %s" % (args.control_file, bed_file_name),
stdin=subprocess.PIPE,
shell=True,
)
process.communicate()
if process.returncode != 0:
sys.stderr.write("Error: Cannot convert BAM file to BED file.\nCheck if bedtools2 (https://github.com/arq5x/bedtools2) has been installed correctly.\n")
sys.exit(1)
args.control_file = bed_file_name
if (not (args.species in GenomeData.species_chroms.keys())):
sys.stderr.write("Error: Species " + args.species + " not recognized.\n")
sys.exit(1)
if (not (args.effective_genome_fraction <= 1 and args.effective_genome_fraction >= 0)):
sys.stderr.write("Error: Effective genome fraction must be a value between 0 and 1.\n")
sys.exit(1)
if (args.gap_size % args.window_size != 0):
sys.stderr.write("Error: Gap size is not a multiple of window size.\n")
sys.exit(1)
if not os.path.exists(args.output_directory):
try:
os.makedirs(args.output_directory)
except:
sys.stderr.write("Error: Gap size is not a multiple of window size.\n" % args.output_directory)
sys.exit(1)
if not(os.path.isabs(args.output_directory)):
args.output_directory = os.path.join(curr_path, args.output_directory)
if args.cpu > cpu_available:
args.cpu = cpu_available
warnings.warn("The number of CPU cores entered is greater than the number of cores available for this process. Executing SICER with the maximum number of cores available.\n")
print("Running SICER with given arguments \n")
run_SICER.main(args)
print("\nProgram Finished Running")
if __name__ == '__main__':
main()