Add CoNLL outputter; debug JSON outputter

kalyanp · Sep 9, 2014 · 9b54ad9 · 9b54ad9
1 parent 7c5ce60
commit 9b54ad9
Show file tree

Hide file tree

Showing 7 changed files with 403 additions and 34 deletions.
diff --git a/itest/src/edu/stanford/nlp/pipeline/AnnotationOutputterITest.java b/itest/src/edu/stanford/nlp/pipeline/AnnotationOutputterITest.java
@@ -0,0 +1,132 @@
+package edu.stanford.nlp.pipeline;
+
+import junit.framework.TestCase;
+
+import java.io.IOException;
+import java.util.Properties;
+
+/**
+ * Tests for the various annotation outputters which require the models to be loaded.
+ *
+ * @author Gabor Angeli
+ */
+public class AnnotationOutputterITest extends TestCase {
+
+  /** Pipeline shared by all tests; it is constructed once, when this class is initialized. */
+  static final StanfordCoreNLP pipeline = new StanfordCoreNLP(pipelineProperties());
+
+  /**
+   * Build the pipeline configuration on a plain Properties object; double-brace initialization
+   * would create an anonymous Properties subclass for no benefit.
+   */
+  private static Properties pipelineProperties() {
+    Properties props = new Properties();
+    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse");
+    return props;
+  }
+
+  /** Two short sentences should print as two CoNLL blocks separated by a blank line. */
+  public void testSimpleSentenceCoNLL() throws IOException {
+    Annotation ann = new Annotation("The cat is fat. The dog is lazy.");
+    pipeline.annotate(ann);
+    String actual = new CoNLLOutputter().print(ann);
+    String expected =
+        "1\tThe\tthe\tDT\tO\t2\tdet\n" +
+            "2\tcat\tcat\tNN\tO\t4\tnsubj\n" +
+            "3\tis\tbe\tVBZ\tO\t4\tcop\n" +
+            "4\tfat\tfat\tJJ\tO\t0\tROOT\n" +
+            "5\t.\t.\t.\tO\t_\t_\n" +
+            "\n" +
+            "1\tThe\tthe\tDT\tO\t2\tdet\n" +
+            "2\tdog\tdog\tNN\tO\t4\tnsubj\n" +
+            "3\tis\tbe\tVBZ\tO\t4\tcop\n" +
+            "4\tlazy\tlazy\tJJ\tO\t0\tROOT\n" +
+            "5\t.\t.\t.\tO\t_\t_";
+    assertEquals(expected, actual);
+  }
+
+  /** A two-word fragment should print with its parse, three dependency views and tokens. */
+  public void testSimpleSentenceJSON() throws IOException {
+    Annotation ann = new Annotation("Bad wolf");
+    pipeline.annotate(ann);
+    String actual = new JSONOutputter().print(ann);
+    String expected =
+        "{\n" +
+            "  \"sentences\": [\n" +
+            "    {\n" +
+            "      \"index\": \"0\",\n" +
+            "      \"parse\": \"(ROOT\\n  (NP (JJ Bad) (NN wolf)))\\n\\n\",\n" +
+            "      \"basic-dependencies\": [\n" +
+            "        {\n" +
+            "          \"dep\": \"ROOT\",\n" +
+            "          \"governor\": \"0\",\n" +
+            "          \"governorGloss\": \"ROOT\",\n" +
+            "          \"dependent\": \"2\",\n" +
+            "          \"dependentGloss\": \"wolf\"\n" +
+            "        },\n" +
+            "        {\n" +
+            "          \"dep\": \"amod\",\n" +
+            "          \"governor\": \"2\",\n" +
+            "          \"governorGloss\": \"wolf\",\n" +
+            "          \"dependent\": \"1\",\n" +
+            "          \"dependentGloss\": \"Bad\"\n" +
+            "        }\n" +
+            "      ],\n" +
+            "      \"collapsed-dependencies\": [\n" +
+            "        {\n" +
+            "          \"dep\": \"ROOT\",\n" +
+            "          \"governor\": \"0\",\n" +
+            "          \"governorGloss\": \"ROOT\",\n" +
+            "          \"dependent\": \"2\",\n" +
+            "          \"dependentGloss\": \"wolf\"\n" +
+            "        },\n" +
+            "        {\n" +
+            "          \"dep\": \"amod\",\n" +
+            "          \"governor\": \"2\",\n" +
+            "          \"governorGloss\": \"wolf\",\n" +
+            "          \"dependent\": \"1\",\n" +
+            "          \"dependentGloss\": \"Bad\"\n" +
+            "        }\n" +
+            "      ],\n" +
+            "      \"collapsed-ccprocessed-dependencies\": [\n" +
+            "        {\n" +
+            "          \"dep\": \"ROOT\",\n" +
+            "          \"governor\": \"0\",\n" +
+            "          \"governorGloss\": \"ROOT\",\n" +
+            "          \"dependent\": \"2\",\n" +
+            "          \"dependentGloss\": \"wolf\"\n" +
+            "        },\n" +
+            "        {\n" +
+            "          \"dep\": \"amod\",\n" +
+            "          \"governor\": \"2\",\n" +
+            "          \"governorGloss\": \"wolf\",\n" +
+            "          \"dependent\": \"1\",\n" +
+            "          \"dependentGloss\": \"Bad\"\n" +
+            "        }\n" +
+            "      ],\n" +
+            "      \"tokens\": [\n" +
+            "        {\n" +
+            "          \"index\": \"1\",\n" +
+            "          \"word\": \"Bad\",\n" +
+            "          \"lemma\": \"bad\",\n" +
+            "          \"characterOffsetBegin\": \"0\",\n" +
+            "          \"characterOffsetEnd\": \"3\",\n" +
+            "          \"pos\": \"JJ\",\n" +
+            "          \"ner\": \"O\"\n" +
+            "        },\n" +
+            "        {\n" +
+            "          \"index\": \"2\",\n" +
+            "          \"word\": \"wolf\",\n" +
+            "          \"lemma\": \"wolf\",\n" +
+            "          \"characterOffsetBegin\": \"4\",\n" +
+            "          \"characterOffsetEnd\": \"8\",\n" +
+            "          \"pos\": \"NN\",\n" +
+            "          \"ner\": \"O\"\n" +
+            "        }\n" +
+            "      ]\n" +
+            "    }\n" +
+            "  ]\n" +
+            "}";
+    assertEquals(expected, actual);
+  }
+}
diff --git a/src/edu/stanford/nlp/pipeline/CoNLLOutputter.java b/src/edu/stanford/nlp/pipeline/CoNLLOutputter.java
@@ -0,0 +1,203 @@
+package edu.stanford.nlp.pipeline;
+
+import edu.stanford.nlp.ling.CoreAnnotations;
+import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.ling.IndexedWord;
+import edu.stanford.nlp.semgraph.SemanticGraph;
+import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
+import edu.stanford.nlp.semgraph.SemanticGraphEdge;
+import edu.stanford.nlp.util.CoreMap;
+import edu.stanford.nlp.util.StringUtils;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+/**
+ * <p>Write a subset of our CoreNLP output in CoNLL format.</p>
+ *
+ * <p>The fields currently output are:</p>
+ *
+ * <table>
+ *   <tr>
+ *     <td>Field Number</td>
+ *     <td>Field Name</td>
+ *     <td>Description</td>
+ *   </tr>
+ *   <tr>
+ *     <td>1</td>
+ *     <td>ID</td>
+ *     <td>Token Counter, starting at 1 for each new sentence.</td>
+ *   </tr>
+ *   <tr>
+ *     <td>2</td>
+ *     <td>FORM</td>
+ *     <td>Word form or punctuation symbol.</td>
+ *   </tr>
+ *   <tr>
+ *     <td>3</td>
+ *     <td>LEMMA</td>
+ *     <td>Lemma of word form, or an underscore if not available.</td>
+ *   </tr>
+ *   <tr>
+ *     <td>4</td>
+ *     <td>POSTAG</td>
+ *     <td>Fine-grained part-of-speech tag, or underscore if not available.</td>
+ *   </tr>
+ *   <tr>
+ *     <td>5</td>
+ *     <td>NER</td>
+ *     <td>Named Entity tag, or underscore if not available.</td>
+ *   </tr>
+ *   <tr>
+ *     <td>6</td>
+ *     <td>HEAD</td>
+ *     <td>Head of the current token, which is either a value of ID or zero ('0').
+ *         This is underscore if not available.</td>
+ *   </tr>
+ *   <tr>
+ *     <td>7</td>
+ *     <td>DEPREL</td>
+ *     <td>Dependency relation to the HEAD, or underscore if not available.</td>
+ *   </tr>
+ * </table>
+ *
+ * @author Gabor Angeli
+ */
+public class CoNLLOutputter extends AnnotationOutputter {
+
+  /** Placeholder written for any field whose value is not available. */
+  private static final String NULL_PLACEHOLDER = "_";
+
+  public CoNLLOutputter() { }
+
+  /**
+   * Substitute the placeholder for a missing field value.
+   *
+   * @param in the field value, possibly null
+   * @return {@code in}, or {@link #NULL_PLACEHOLDER} if {@code in} is null
+   */
+  private static String orNull(String in) {
+    return in == null ? NULL_PLACEHOLDER : in;
+  }
+
+  /**
+   * Write a line of the CoNLL output: one token as tab-separated fields, without a line terminator.
+   *
+   * @param index  the 1-based position of the token in its sentence
+   * @param token  the token supplying the word, lemma, POS tag and NER tag
+   * @param head   the index of the governing token, 0 for a root, or negative if unknown
+   * @param deprel the relation to the head; ignored when {@code head} is negative
+   * @return the seven fields joined by tab characters
+   */
+  private static String line(int index,
+                             CoreLabel token,
+                             int head, String deprel) {
+    List<String> fields = new ArrayList<>(7);
+
+    fields.add(Integer.toString(index)); // 1
+    fields.add(orNull(token.word()));    // 2
+    fields.add(orNull(token.lemma()));   // 3
+    fields.add(orNull(token.tag()));     // 4
+    fields.add(orNull(token.ner()));     // 5
+    if (head >= 0) {
+      fields.add(Integer.toString(head));  // 6
+      fields.add(orNull(deprel));          // 7
+    } else {
+      fields.add(NULL_PLACEHOLDER);        // 6
+      fields.add(NULL_PLACEHOLDER);        // 7
+    }
+
+    return StringUtils.join(fields, "\t");
+  }
+
+  /**
+   * Write each sentence of the document as one line per token, with a blank line between
+   * consecutive sentences and no line terminator after the final token. Lines always end in
+   * a single newline character so the output does not vary with the platform line separator.
+   *
+   * @param doc     the annotated document; nothing is written if it has no sentences
+   * @param target  the stream to write to; it is flushed but not closed
+   * @param options output options; not consulted by this outputter
+   * @throws IOException declared for the overridden signature; the PrintWriter used here
+   *                     does not itself throw on write errors
+   */
+  @Override
+  public void print(Annotation doc, OutputStream target, Options options) throws IOException {
+    // NOTE(review): PrintWriter(OutputStream) encodes with the platform default charset;
+    // confirm that this matches the encoding callers use to decode the bytes.
+    PrintWriter writer = new PrintWriter(target);
+
+    List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
+    if (sentences != null) {
+      boolean firstSentence = true;
+      for (CoreMap sentence : sentences) {
+        if (!firstSentence) {
+          // Terminate the previous sentence's last line, then leave one blank line
+          writer.print("\n\n");
+        }
+        firstSentence = false;
+
+        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
+        if (tokens == null) {
+          continue;
+        }
+        SemanticGraph depTree = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
+        // The roots depend only on the sentence, so collect their indices once rather than per token
+        Set<Integer> rootSet = null;
+        if (depTree != null) {
+          rootSet = depTree.getRoots().stream().map(IndexedWord::index).collect(Collectors.toSet());
+        }
+
+        for (int i = 0; i < tokens.size(); ++i) {
+          // Newline if applicable
+          if (i > 0) {
+            writer.print('\n');
+          }
+
+          // Try to get the incoming dependency edge
+          int head = -1;
+          String deprel = null;
+          if (depTree != null) {
+            IndexedWord node = depTree.getNodeByIndexSafe(i + 1);
+            if (node != null) {
+              List<SemanticGraphEdge> edgeList = depTree.getIncomingEdgesSorted(node);
+              if (!edgeList.isEmpty()) {
+                assert edgeList.size() == 1;
+                head = edgeList.get(0).getGovernor().index();
+                deprel = edgeList.get(0).getRelation().toString();
+              } else if (rootSet.contains(i + 1)) {
+                head = 0;
+                deprel = "ROOT";
+              }
+            }
+          }
+
+          // Write the token
+          writer.print(line(i + 1, tokens.get(i), head, deprel));
+        }
+      }
+    }
+    writer.flush();
+  }
+
+  /** Write the annotation to the stream in CoNLL format using a new outputter. */
+  public static void conllPrint(Annotation annotation, OutputStream os) throws IOException {
+    new CoNLLOutputter().print(annotation, os);
+  }
+
+  /** Write the annotation in CoNLL format, passing the pipeline through to a new outputter. */
+  public static void conllPrint(Annotation annotation, OutputStream os, StanfordCoreNLP pipeline) throws IOException {
+    new CoNLLOutputter().print(annotation, os, pipeline);
+  }
+
+  /** Write the annotation in CoNLL format, passing the options through to a new outputter. */
+  public static void conllPrint(Annotation annotation, OutputStream os, Options options) throws IOException {
+    new CoNLLOutputter().print(annotation, os, options);
+  }
+
+}
diff --git a/src/edu/stanford/nlp/pipeline/JSONOutputter.java b/src/edu/stanford/nlp/pipeline/JSONOutputter.java
@@ -2,6 +2,7 @@
 
 
 import edu.stanford.nlp.ling.CoreAnnotations;
+import edu.stanford.nlp.ling.IndexedWord;
 import edu.stanford.nlp.semgraph.SemanticGraph;
 import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
 import edu.stanford.nlp.semgraph.SemanticGraphEdge;
@@ -106,13 +107,24 @@ public void print(Annotation doc, OutputStream target, Options options) throws I
   @SuppressWarnings("RedundantCast")  // It's lying; we need the "redundant" casts (as of 2014-09-08)
   private static Object buildDependencyTree(SemanticGraph graph) {
     if(graph != null) {
-      return graph.edgeListSorted().stream().map( (SemanticGraphEdge edge) -> (Consumer<Writer>) (Writer dep) -> {
-        dep.set("dep", edge.getRelation());
-        dep.set("governor", Integer.toString(edge.getGovernor().index()));
-        dep.set("governorGloss", edge.getGovernor().word());
-        dep.set("dependent", Integer.toString(edge.getDependent().index()));
-        dep.set("dependentGloss", edge.getDependent().word());
-      });
+      return Stream.concat(
+          // Roots
+          graph.getRoots().stream().map( (IndexedWord root) -> (Consumer<Writer>) dep -> {
+            dep.set("dep", "ROOT");
+            dep.set("governor", "0");
+            dep.set("governorGloss", "ROOT");
+            dep.set("dependent", Integer.toString(root.index()));
+            dep.set("dependentGloss", root.word());
+          }),
+          // Regular edges
+          graph.edgeListSorted().stream().map( (SemanticGraphEdge edge) -> (Consumer<Writer>) (Writer dep) -> {
+            dep.set("dep", edge.getRelation().toString());
+            dep.set("governor", Integer.toString(edge.getGovernor().index()));
+            dep.set("governorGloss", edge.getGovernor().word());
+            dep.set("dependent", Integer.toString(edge.getDependent().index()));
+            dep.set("dependentGloss", edge.getDependent().word());
+          })
+      );
     } else {
       return null;
     }

diff --git a/src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java b/src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java
@@ -80,7 +80,7 @@
 
 public class StanfordCoreNLP extends AnnotationPipeline {
 
-  enum OutputFormat { TEXT, XML, JSON, SERIALIZED }
+  enum OutputFormat { TEXT, XML, JSON, CONLL, SERIALIZED }
 
   // other constants
   public static final String CUSTOM_ANNOTATOR_PREFIX = "customAnnotatorClass.";
@@ -486,6 +486,19 @@ public void jsonPrint(Annotation annotation, Writer w) throws IOException {
     w.flush();
   }
 
+  /**
+   * Renders the annotation in CoNLL format and sends the resulting text to a writer.
+   * @param annotation The annotation to render
+   * @param w The Writer that receives the CoNLL text; it is flushed before returning
+   * @throws IOException If rendering the annotation or writing to the Writer fails
+   */
+  public void conllPrint(Annotation annotation, Writer w) throws IOException {
+    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+    new CoNLLOutputter().print(annotation, buffer, this);
+    w.write(new String(buffer.toByteArray(), getEncoding()));
+    w.flush();
+  }
+
   /**
    * Displays the output of all annotators in XML format.
    * @param annotation Contains the output of all annotators
@@ -869,6 +882,12 @@ public void processFiles(String base, final Collection<File> files, int numThrea
               fos.close();
               break;
             }
+            case CONLL: {
+              try (OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename))) {
+                new CoNLLOutputter().print(annotation, fos);
+              } // the stream is closed even if printing throws
+              break;
+            }
             case TEXT: {
               OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
               prettyPrint(annotation, fos);