Skip to content

Commit

Permalink
Support .gbk / .gbff [.gz] files
Browse files Browse the repository at this point in the history
  • Loading branch information
dfornika committed Sep 2, 2017
1 parent 3c14261 commit 39aaaa0
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 6 deletions.
26 changes: 20 additions & 6 deletions scripts/add_to_library.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,24 +24,38 @@ set -e # Stop on error

LIBRARY_DIR="$KRAKEN_DB_NAME/library"

if [ ! -e "$1" ]
input_file=$1

if [ ! -e "$input_file" ]
then
echo "Can't add \"$1\": file does not exist"
echo "Can't add \"$input_file\": file does not exist"
exit 1
fi
if [ ! -f "$1" ]
if [ ! -f "$input_file" ]
then
echo "Can't add \"$1\": not a regular file"
echo "Can't add \"$input_file\": not a regular file"
exit 1
fi

add_dir="$LIBRARY_DIR/added"
mkdir -p "$add_dir"
scan_fasta_file.pl "$1" > "$add_dir/temp_map.txt"

filename=$(cp_into_tempfile.pl -t "XXXXXXXXXX" -d "$add_dir" -s fna "$1")
if [[ $input_file == *.gbff || $input_file == *.gbff.gz || $input_file == *.gbk || $input_file == *.gbk.gz ]]
then
convert_gb_to_fa.pl $input_file > "$add_dir/temp.fna"
input_file="$add_dir/temp.fna"
fi

scan_fasta_file.pl "$input_file" > "$add_dir/temp_map.txt"

filename=$(cp_into_tempfile.pl -t "XXXXXXXXXX" -d "$add_dir" -s fna "$input_file")

cat "$add_dir/temp_map.txt" >> "$add_dir/prelim_map.txt"
rm "$add_dir/temp_map.txt"

if [ -e "$add_dir/temp.fna" ]
then
rm "$add_dir/temp.fna"
fi

echo "Added \"$1\" to library ($KRAKEN_DB_NAME)"
62 changes: 62 additions & 0 deletions scripts/convert_gb_to_fa.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env perl

# Copyright 2013-2017, Derrick Wood <[email protected]>
#
# This file is part of the Kraken taxonomic sequence classification system.
#
# Kraken is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Kraken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Kraken. If not, see <http://www.gnu.org/licenses/>.

# Pull sequence data with accession and taxid from Genbank file and output in fasta format
# Adapted from @tseemann https://github.com/MDU-PHL/mdu-tools/blob/master/bin/genbank-to-kraken_fasta.pl

use strict;
use warnings;

@ARGV or die "Usage: $0 <file.gbk[.gz]> ...";

# Convert each GenBank flat file named on the command line to FASTA on STDOUT.
#
# For every record, one header of the form
#     >ACCESSION.VERSION|kraken:taxid|TAXID
# is printed when the ORIGIN line is reached, followed by the record's
# sequence in upper case, one output line per input sequence line.
# Files whose name ends in ".gz" are decompressed on the fly with gunzip.
#
# Dies if a file cannot be opened, if reading it fails, or if gunzip exits
# with a non-zero status (e.g. corrupt or truncated archive), so callers
# never get a silently truncated FASTA file.
for my $input_file (@ARGV) {

    # The dot is escaped and the match is anchored at the very end of the
    # name, so only a real ".gz" suffix selects the gunzip path.
    my $is_gzipped = $input_file =~ /\.gz\z/;

    my $in;
    if ($is_gzipped) {
        # List-form pipe open: the file name is handed to gunzip as a single
        # argument and never passes through a shell, so spaces and shell
        # metacharacters in the name are harmless.
        open($in, '-|', 'gunzip', '-c', $input_file)
            or die "can’t open pipe to $input_file: $!";
    }
    else {
        # Three-argument open: the name is always treated as a plain path,
        # never as a mode or a command.
        open($in, '<', $input_file)
            or die "can’t open $input_file: $!";
    }

    # Per-record state, cleared at each "//" record terminator.
    my ($seqid, $in_seq, $taxid);

    while (my $line = <$in>) {
        if ($line =~ m/^VERSION\s+(\S+)/) {
            $seqid = $1;
        }
        elsif ($line =~ m/taxon:(\d+)/) {
            $taxid = $1;
        }
        elsif ($line =~ m/^ORIGIN/) {
            # Sequence data starts on the next line; emit the FASTA header.
            # NOTE(review): a record without a VERSION line or a taxon
            # qualifier yields an incomplete header plus an "uninitialized
            # value" warning, as before -- confirm whether that should be fatal.
            $in_seq = 1;
            print ">$seqid|kraken:taxid|$taxid\n";
        }
        elsif ($line =~ m{^//}) {
            # End of record: forget everything learned about it.
            $in_seq = $taxid = $seqid = undef;
        }
        elsif ($in_seq) {
            # Drop the 10-column position prefix, then all remaining
            # whitespace (column separators and the newline).
            substr $line, 0, 10, '';
            $line =~ s/\s//g;
            print uc($line), "\n";
        }
    }

    # For a piped handle close() also reaps gunzip: it returns false when
    # gunzip exited non-zero, in which case $! is 0 and $? holds the status.
    close($in)
        or die "error reading $input_file: "
             . ($! || "gunzip exited with wait status $?");
}

0 comments on commit 39aaaa0

Please sign in to comment.