-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfastq_clean_pairs.pl
107 lines (87 loc) · 2.58 KB
/
fastq_clean_pairs.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/perl
# ejr - 20120216
# open two fastq files, write matching pairs to right and lefts files and then
# unpaired to a third file
# inefficient IO, but has a lower memory footprint for big files
# I should find a better way to do this. BerkeleyDb and SQlite are slower
use strict;
use warnings;
use File::Spec;
use Getopt::Long;
##$file_1 and _2 should be mate fq files
my $file_1;
my $file_2;
GetOptions(
'1|file_1:s' => \$file_1,
'2|file_2:s' => \$file_2,
'h|help' => \&getHelp,
);
sub getHelp {
print "
This script takes 2 paired fastq files and matches mates and prints unPaired reads to STDOUT.
Please name the fastq files with .fq extention.
usage:
./clean_pairs.pl [-1 fastq file 1] [-2 fastq file 2][-h]
options:
-1 STR fq file 1 [required, no default]
-2 STR fq file 2 [required, no default]
-h this message
";
exit 1;
}
if ( !defined $file_1 and !defined $file_2 ) {
print "\n\nMust provide a two files containing paired reads\n\n";
&getHelp;
exit 1;
}
my $file_1_path = File::Spec->rel2abs($file_1);
my $file_2_path = File::Spec->rel2abs($file_2);
my $sample;
##sample_1.fq
##sampe_pair1.fq
if ( $file_1 =~ /(\S+?)_?(\S+)\.(fq|fastq)/ ) {
$sample = $1;
}
my $fq_1 = $file_1;
my $fq_2 = $file_2;
$fq_1 =~ s/(\.fq|\.fastq)$//;
$fq_2 =~ s/(\.fq|\.fastq)$//;
my %lefts;
my %rights;
open (LEFT, $file_1) or die "cannot open $file_1:$!\n";
open (RIGHT, $file_2) or die "cannot open $file_2:$!\n";
open (LEFTOUT, ">$fq_1.matched.fq") or die "cannot open file:$!\n";
open (RIGHTOUT, ">$fq_2.matched.fq") or die "cannot open file:$!\n";
while (my $left_header = <LEFT>) {
$lefts{$left_header} = 1;
<LEFT>;
<LEFT>;
<LEFT>;
}
close LEFT;
while (my $right_header = <RIGHT>) {
my $right_seq = <RIGHT>;
my $right_qual_header = <RIGHT>;
my $right_qual = <RIGHT>;
my $right_full = $right_header.$right_seq.$right_qual_header.$right_qual;
if (exists($lefts{$right_header})){
print RIGHTOUT $right_full;
$rights{$right_header} = 1;
} else {
print $right_full;
}
}
close RIGHT;
open (LEFT, $file_1) or die "cannot open $file_1:$!\n";
#open (LEFT, $ARGV[0]) or die "cannot open file:$!\n";
while (my $left_header = <LEFT>) {
my $left_seq = <LEFT>;
my $left_qual_header = <LEFT>;
my $left_qual = <LEFT>;
my $left_full =$left_header.$left_seq.$left_qual_header.$left_qual;
if ($rights{$left_header}) {
print LEFTOUT $left_full;
} else {
print $left_full;
}
}