Adding per-sequence name translation

haoziyeung · Jan 28, 2015 · 450c599 · 450c599
1 parent cad40ba
commit 450c599
Show file tree

Hide file tree

Showing 3 changed files with 185 additions and 0 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -8,6 +8,7 @@ v0.10.5-beta:
 * Reduced memory usage of db_shrink (Build step 2 / kraken-build --shrink)
 * Reduced memory usage of db_sort (Build step 3)
 * Support added for KRAKEN_DB_PATH and KRAKEN_DEFAULT_DB env. variables
+* Added kraken-translate for getting taxonomic names for each sequence
 
 v0.10.4-beta:
 * use GRCh38 for human genome library

diff --git a/README b/README
@@ -298,6 +298,43 @@ left to right, they are:
      - the next k-mer was not in the database
      - the last 3 k-mers mapped to taxonomy ID #562
 
+For users who want the full taxonomic name associated with each input
+sequence, we provide a script named "kraken-translate" that produces two
+different output formats for classified sequences.  The script operates
+on the output of "kraken", like so:
+
+  kraken --db $DBNAME sequences.fa > sequences.kraken
+  kraken-translate --db $DBNAME sequences.kraken > sequences.labels
+
+(The same database used to run kraken should be used to translate the
+output; see DATABASE ENVIRONMENT VARIABLES below for ways to reduce
+redundancy on the command line.)
+
+The file sequences.labels generated by the above example is a text file
+with two tab-delimited columns, and one line for each classified sequence
+in sequences.fa; unclassified sequences are not reported by
+kraken-translate.  The first column of kraken-translate's output are the
+sequence IDs of the classified sequences, and the second column contains
+the taxonomy of the sequence.  For example, an output line from "kraken" of:
+
+  C	SEQ1	562	36	562:6
+
+Would result in a corresponding output line from kraken-translate of:
+
+  SEQ1	root;cellular organisms;Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Escherichia;Escherichia coli
+
+Alternatively, kraken-translate accepts the option "--mpa-format" which
+will report only levels of the taxonomy with standard rank assignments
+(superkingdom, kingdom, phylum, class, order, family, genus, species),
+and uses pipes to delimit the various levels of the taxonomy.  For example,
+"kraken-translate --mpa-format --db $DBNAME" with the above example output
+from "kraken" would result in the following line of output:
+
+  SEQ1	d__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Escherichia|s__Escherichia_coli
+
+Taxonomy assignments above the superkingdom (d__) rank are represented as
+just "root" when using the --mpa-report for kraken-translate.
+
 
 CUSTOM DATABASES
 ================

diff --git a/scripts/kraken-translate b/scripts/kraken-translate
@@ -0,0 +1,147 @@
+#!/usr/bin/perl
+
+# Copyright 2013-2015, Derrick Wood <[email protected]>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
+
+# For each classified read, prints sequence ID and full taxonomy
+
+use strict;
+use warnings;
+use File::Basename;
+use Getopt::Long;
+
+my $PROG = basename $0;
+my $KRAKEN_DIR = "#####=KRAKEN_DIR=#####";
+
+# Test to see if the executables got moved, try to recover if we can
+if (! -e "$KRAKEN_DIR/classify") {
+  use Cwd 'abs_path';
+  $KRAKEN_DIR = dirname abs_path($0);
+}
+
+require "$KRAKEN_DIR/krakenlib.pm";
+
+my $db_prefix;
+my $mpa_format = 0;
+
+GetOptions(
+  "help" => \&display_help,
+  "version" => \&display_version,
+  "db=s" => \$db_prefix,
+  "mpa-format" => \$mpa_format
+);
+
+eval { $db_prefix = krakenlib::find_db($db_prefix); };
+if ($@) {
+  die "$PROG: $@";
+}
+
+sub display_help {
+  print STDERR "Usage: $PROG [--db KRAKEN_DB_NAME] <kraken output file(s)>\n";
+  exit 0;
+}
+
+sub display_version {
+  print "Kraken version #####=VERSION=#####\n";
+  print "Copyright 2013-2015, Derrick Wood (dwood\@cs.jhu.edu)\n";
+  exit 0;
+}
+
+my (%parent_map, %name_map, %rank_map);
+load_taxonomy($db_prefix);
+my %known_taxonomy_strings;
+
+while (<>) {
+  next unless /^C/;
+  chomp;
+  my @fields = split;
+  my ($seqid, $taxid) = @fields[1,2];
+  my $taxonomy_str = get_taxonomy_str($taxid);
+  print "$seqid\t$taxonomy_str\n";
+}
+
+sub get_taxonomy_str {
+  my $taxid = shift;
+  if (! exists $known_taxonomy_strings{$taxid}) {
+    my @nodes;
+    while ($taxid > 0) {
+      if ($mpa_format) {
+        my $rank_code = rank_code($rank_map{$taxid});
+        my $name = $name_map{$taxid};
+        $name =~ tr/ /_/;
+        unshift @nodes, lc($rank_code) . "__" . $name if $rank_code ne "-";
+      }
+      else {
+        unshift @nodes, $name_map{$taxid};
+      }
+      $taxid = $parent_map{$taxid};
+    }
+    if ($mpa_format) {
+      $known_taxonomy_strings{$taxid} = @nodes ? join("|", @nodes) : "root";
+    }
+    else {
+      $known_taxonomy_strings{$taxid} = join(";", @nodes);
+    }
+  }
+  return $known_taxonomy_strings{$taxid};
+}
+
+sub rank_code {
+  my $rank = shift;
+  for ($rank) {
+    $_ eq "species" and return "S";
+    $_ eq "genus" and return "G";
+    $_ eq "family" and return "F";
+    $_ eq "order" and return "O";
+    $_ eq "class" and return "C";
+    $_ eq "phylum" and return "P";
+    $_ eq "kingdom" and return "K";
+    $_ eq "superkingdom" and return "D";
+  }
+  return "-";
+}
+
+sub load_taxonomy {
+  my $prefix = shift;
+
+  open NAMES, "<", "$prefix/taxonomy/names.dmp"
+    or die "$PROG: can't open names file: $!\n";
+  while (<NAMES>) {
+    chomp;
+    s/\t\|$//;
+    my @fields = split /\t\|\t/;
+    my ($node_id, $name, $type) = @fields[0,1,3];
+    if ($type eq "scientific name") {
+      $name_map{$node_id} = $name;
+    }
+  }
+  close NAMES;
+
+  open NODES, "<", "$prefix/taxonomy/nodes.dmp"
+    or die "$PROG: can't open nodes file: $!\n";
+  while (<NODES>) {
+    chomp;
+    my @fields = split /\t\|\t/;
+    my ($node_id, $parent_id, $rank) = @fields[0,1,2];
+    if ($node_id == 1) {
+      $parent_id = 0;
+    }
+    $parent_map{$node_id} = $parent_id;
+    $rank_map{$node_id} = $rank;
+  }
+  close NODES;
+}