-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathSEMedley.pl
207 lines (197 loc) · 7.62 KB
/
SEMedley.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
use warnings;
use strict;
use File::Basename;
my $skipped_exon_file;
my $annotation_file;
my $genome_fasta_file;
my $expression_file;
my $TPM1;
my $TPM2;
my $cond1_ct;
my $cond2_ct;
my $FDR;
my $dirname = dirname(__FILE__);
sub program_info {
print "\n\tSEMedley.pl will automatically run the following on an rMATS SE JCEC file:\n\t\t- SEFractionExpressed.pl\n\t\t- SEIntronExonSizes.pl\n\t\t- SENumberSkipped.pl\n\t\t- SESpliceSiteScoring.pl\n\t\t- SETranslateNMD.pl\n\t\t- SEUnannotated.pl\n\n\tUsage: perl SEMedley.pl [OPTIONS] -s <skipped exon file (rMATS JCEC)> -a <bed12 annotation file> -g <genome fasta file> -e <expression file> -TPM <min TPMs condition 1,min TPMs condition 2> -SN <sample number condition 1,sample number condition2> -f <FDR>\n\n\tRequired:\n\t\t-s <skipped exon file>\n\t\t-a <bed12 annotation file>\n\t\t-g <genome fasta file>\n\t\t-e <expression file>\n\t\t-TPM <min TPMs condition 1,min TPMs condition 2>\n\t\t-SN <sample number condition 1,sample number condition 2>\n\t\t-f <FDR>\n\n\tAdditional:\n\t\t-h help\n\n\tExample: perl SEMedley.pl -s PATH/SEfile.txt -a PATH/bed12_annotation.bed -g PATH/genome.fa -e PATH/expression.tsv -TPM 2,2 -SN 3,3 -f 0.05\n\n";
exit;
}
sub options {
if (scalar @ARGV == 0) {
program_info;
exit;
}
for (my $i=0; $i < scalar @ARGV; $i++) {
if ($ARGV[$i] eq "\-s") {
$skipped_exon_file = $ARGV[$i+1];
}
elsif ($ARGV[$i] eq "\-g") {
$genome_fasta_file = $ARGV[$i+1];
}
elsif ($ARGV[$i] eq "\-a") {
$annotation_file = $ARGV[$i+1];
}
elsif ($ARGV[$i] eq "\-f") {
$FDR = $ARGV[$i+1];
}
elsif ($ARGV[$i] eq "\-e") {
$expression_file = $ARGV[$i+1];
}
elsif ($ARGV[$i] eq "\-TPM") {
my $TPMs = $ARGV[$i+1];
($TPM1, $TPM2) = split("\,", $TPMs);
}
elsif ($ARGV[$i] eq "\-SN") {
my $sample_counts = $ARGV[$i+1];
($cond1_ct, $cond2_ct) = split("\,", $sample_counts);
}
elsif ($ARGV[$i] eq "\-h") {
program_info;
exit;
}
}
}
sub qc {
if ($FDR !~ m/^\d/) {
print "\nFDR is not numeric!\n";
program_info;
exit;
}
elsif ($skipped_exon_file eq "") {
print "\nSkipped exon file not defined!\n";
program_info;
exit;
}
elsif ($genome_fasta_file eq "") {
print "\nGenome fasta file not defined!\n";
program_info;
exit;
}
elsif ($expression_file eq "") {
print "\nExpression file not defined!\n";
program_info;
exit;
}
elsif ($FDR eq "") {
print "\nFDR not defined!\n";
program_info;
exit;
}
elsif ($TPM1 eq "") {
print "\nMinimum TPMs for condition 1 not defined!\n Is -TPM value in correct format?";
program_info;
exit;
}
elsif ($TPM2 eq "") {
print "\nMinimum TPMs for condition 2 not defined!\n Is -TPM value in correct format?";
program_info;
exit;
}
elsif ($cond1_ct eq "") {
print "\nCondition 1 sample count not defined!\n Is -SN value in correct format?";
program_info;
exit;
}
elsif ($cond2_ct eq "") {
print "\nCondition 2 sample count not defined!\n Is -SN value in correct format?";
program_info;
exit;
}
open(INF, "<$skipped_exon_file") or die "couldn't open skipped exon file";
while(my $line = <INF>) {
chomp($line);
$line =~ s/\"//g;
my @split_line = split("\t", $line);
if ($. == 2) {
if (scalar @split_line != 23 || $split_line[4] !~ m/[+-]/ || $split_line[5] !~ m/^\d/ || $split_line[6] !~ m/^\d/ || $split_line[7] !~ m/^\d/ || $split_line[8] !~ m/^\d/ || $split_line[9] !~ m/^\d/ || $split_line[10] !~ m/^\d/) {
print "\nSkipped exon file may not be an rMATS JCEC file format!\n\n";
program_info;
exit;
}
elsif ($split_line[3] !~ m/^chr/) {
print "\nSkipped exon file may not be an rMATS JCEC file format!\n Is fourth column chromosome? Fourth column doesn't begin with \"chr\"\n\n";
program_info;
exit;
}
}
elsif ($. == 3) {
last;
}
}
close(INF);
open(INF, "<$genome_fasta_file") or die "couldn't open genome fasta file";
while(my $line = <INF>) {
chomp($line);
my @split_line = split("\t", $line);
if ($. == 1) {
if (scalar @split_line != 1) {
print "\nGenome fasta file appears to be incorrect format!\n\n";
program_info;
exit;
}
elsif ($line =~ m/^\>chr/ || $line =~ m/^\> chr/) {
last;
}
else {
print "\nGenome fasta file appears to be incorrect format!\n First line doesn't match \"\>chr\" or \"\> chr\"\n\n";
program_info;
exit;
}
}
}
close(INF);
if ($annotation_file ne "none") {
open(INF, "<$annotation_file") or die "couldn't open annotation file";
while(my $line = <INF>) {
chomp($line);
my @split_line = split("\t", $line);
if ($. == 2) {
if (scalar @split_line != 12 || $split_line[0] !~ m/^chr/ || $split_line[5] !~ m/[+-]/ || $split_line[6] !~ m/^\d/ || $split_line[7] !~ m/^\d/ || $split_line[8] !~ m/^\d/ || $split_line[9] !~ m/^\d/ || $split_line[10] !~ m/^\d/ || $split_line[11] !~ m/^\d/) {
print "\nAnnotation file may not be in bed12 format!\n\n";
program_info;
exit;
}
}
elsif ($. == 3) {
last;
}
}
close(INF);
}
}
sub print_input_parameters {
print "\n\nINPUT:\nSkipped exon file: ", basename($skipped_exon_file), "\nAnnotation file: ", basename($annotation_file), "\nGenome fasta file: ", basename($genome_fasta_file), "\nExpression file: ", basename($expression_file), "\nMin TPMs, condition 1 = ", $TPM1, "\nMin TPMs, condition 2 = ", $TPM2, "\nSample number condition 1 = ", $cond1_ct, "\nSample number condition 2 = ", $cond2_ct, "\nSE FDR = ", $FDR, "\n\n";
}
sub SESpliceSiteScoring {
my $SESpliceSiteScoring = $dirname."\/SESpliceSiteScoring.pl";
system ("perl $SESpliceSiteScoring -s $skipped_exon_file -g $genome_fasta_file -a $annotation_file -f $FDR");
}
sub SEIntronExonSizes {
my $SEIntronExonSizes = $dirname."\/SEIntronExonSizes.pl";
system ("perl $SEIntronExonSizes -s $skipped_exon_file -a $annotation_file -f $FDR");
}
sub SENumberSkipped {
my $SENumberSkipped = $dirname."\/SENumberSkipped.pl";
system ("perl $SENumberSkipped -s $skipped_exon_file -a $annotation_file -f $FDR");
}
sub SEFractionExpressed {
my $SEFractionExpressed = $dirname."\/SEFractionExpressed.pl";
system ("perl $SEFractionExpressed -s $skipped_exon_file -e $expression_file -SN $cond1_ct,$cond2_ct -TPM $TPM1,$TPM2 -f $FDR");
}
sub SEUnannotated {
my $SEUnannotated = $dirname."\/SEUnannotated.pl";
system ("perl $SEUnannotated -s $skipped_exon_file -a $annotation_file -f $FDR");
}
sub SETranslateNMD {
my $SETranslateNMD = $dirname."\/SETranslateNMD.pl";
system ("perl $SETranslateNMD -s $skipped_exon_file -a $annotation_file -g $genome_fasta_file -f $FDR");
}
options;
qc;
print "\n\nRunning SEKitchenSink...";
print_input_parameters;
SESpliceSiteScoring;
SEIntronExonSizes;
SENumberSkipped;
SEFractionExpressed;
SEUnannotated;
SETranslateNMD;