Skip to content

Commit

Permalink
Add CoNLL outputter; debug JSON outputter
Browse files Browse the repository at this point in the history
  • Loading branch information
gangeli authored and Stanford NLP committed Sep 9, 2014
1 parent 7c5ce60 commit 9b54ad9
Show file tree
Hide file tree
Showing 7 changed files with 403 additions and 34 deletions.
120 changes: 120 additions & 0 deletions itest/src/edu/stanford/nlp/pipeline/AnnotationOutputterITest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
package edu.stanford.nlp.pipeline;

import junit.framework.TestCase;

import java.io.IOException;
import java.util.Properties;

/**
* Tests for the various annotation outputters which require the models to be loaded.
*
* @author Gabor Angeli
*/
public class AnnotationOutputterITest extends TestCase {

  /**
   * Pipeline shared by every test in this class; built once because it is
   * constructed with the full tokenize-through-parse annotator list.
   */
  static StanfordCoreNLP pipeline = new StanfordCoreNLP(pipelineProperties());

  /**
   * Builds the configuration for the shared pipeline.
   *
   * <p>A plain {@link Properties} instance is used instead of double-brace
   * initialization, which would create an anonymous {@code Properties}
   * subclass for no benefit.</p>
   *
   * @return properties selecting the annotators the outputters under test rely on
   */
  private static Properties pipelineProperties() {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse");
    return props;
  }

  /**
   * Annotates two short sentences and checks the exact CoNLL rendering:
   * one tab-separated line per token, a blank line between sentences, and
   * no trailing newline.
   */
  public void testSimpleSentenceCoNLL() throws IOException {
    Annotation ann = new Annotation("The cat is fat. The dog is lazy.");
    pipeline.annotate(ann);
    String actual = new CoNLLOutputter().print(ann);
    String expected =
        "1\tThe\tthe\tDT\tO\t2\tdet\n" +
        "2\tcat\tcat\tNN\tO\t4\tnsubj\n" +
        "3\tis\tbe\tVBZ\tO\t4\tcop\n" +
        "4\tfat\tfat\tJJ\tO\t0\tROOT\n" +
        "5\t.\t.\t.\tO\t_\t_\n" +
        "\n" +
        "1\tThe\tthe\tDT\tO\t2\tdet\n" +
        "2\tdog\tdog\tNN\tO\t4\tnsubj\n" +
        "3\tis\tbe\tVBZ\tO\t4\tcop\n" +
        "4\tlazy\tlazy\tJJ\tO\t0\tROOT\n" +
        "5\t.\t.\t.\tO\t_\t_";
    assertEquals(expected, actual);
  }

  /**
   * Annotates a two-word fragment and checks the exact JSON rendering,
   * including the synthetic ROOT entry at the head of each dependency list.
   */
  public void testSimpleSentenceJSON() throws IOException {
    Annotation ann = new Annotation("Bad wolf");
    pipeline.annotate(ann);
    String actual = new JSONOutputter().print(ann);
    String expected =
        "{\n" +
        " \"sentences\": [\n" +
        " {\n" +
        " \"index\": \"0\",\n" +
        " \"parse\": \"(ROOT\\n (NP (JJ Bad) (NN wolf)))\\n\\n\",\n" +
        " \"basic-dependencies\": [\n" +
        " {\n" +
        " \"dep\": \"ROOT\",\n" +
        " \"governor\": \"0\",\n" +
        " \"governorGloss\": \"ROOT\",\n" +
        " \"dependent\": \"2\",\n" +
        " \"dependentGloss\": \"wolf\"\n" +
        " },\n" +
        " {\n" +
        " \"dep\": \"amod\",\n" +
        " \"governor\": \"2\",\n" +
        " \"governorGloss\": \"wolf\",\n" +
        " \"dependent\": \"1\",\n" +
        " \"dependentGloss\": \"Bad\"\n" +
        " }\n" +
        " ],\n" +
        " \"collapsed-dependencies\": [\n" +
        " {\n" +
        " \"dep\": \"ROOT\",\n" +
        " \"governor\": \"0\",\n" +
        " \"governorGloss\": \"ROOT\",\n" +
        " \"dependent\": \"2\",\n" +
        " \"dependentGloss\": \"wolf\"\n" +
        " },\n" +
        " {\n" +
        " \"dep\": \"amod\",\n" +
        " \"governor\": \"2\",\n" +
        " \"governorGloss\": \"wolf\",\n" +
        " \"dependent\": \"1\",\n" +
        " \"dependentGloss\": \"Bad\"\n" +
        " }\n" +
        " ],\n" +
        " \"collapsed-ccprocessed-dependencies\": [\n" +
        " {\n" +
        " \"dep\": \"ROOT\",\n" +
        " \"governor\": \"0\",\n" +
        " \"governorGloss\": \"ROOT\",\n" +
        " \"dependent\": \"2\",\n" +
        " \"dependentGloss\": \"wolf\"\n" +
        " },\n" +
        " {\n" +
        " \"dep\": \"amod\",\n" +
        " \"governor\": \"2\",\n" +
        " \"governorGloss\": \"wolf\",\n" +
        " \"dependent\": \"1\",\n" +
        " \"dependentGloss\": \"Bad\"\n" +
        " }\n" +
        " ],\n" +
        " \"tokens\": [\n" +
        " {\n" +
        " \"index\": \"1\",\n" +
        " \"word\": \"Bad\",\n" +
        " \"lemma\": \"bad\",\n" +
        " \"characterOffsetBegin\": \"0\",\n" +
        " \"characterOffsetEnd\": \"3\",\n" +
        " \"pos\": \"JJ\",\n" +
        " \"ner\": \"O\"\n" +
        " },\n" +
        " {\n" +
        " \"index\": \"2\",\n" +
        " \"word\": \"wolf\",\n" +
        " \"lemma\": \"wolf\",\n" +
        " \"characterOffsetBegin\": \"4\",\n" +
        " \"characterOffsetEnd\": \"8\",\n" +
        " \"pos\": \"NN\",\n" +
        " \"ner\": \"O\"\n" +
        " }\n" +
        " ]\n" +
        " }\n" +
        " ]\n" +
        "}";
    assertEquals(expected, actual);
  }
}
173 changes: 173 additions & 0 deletions src/edu/stanford/nlp/pipeline/CoNLLOutputter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;

import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

/**
* <p>Write a subset of our CoreNLP output in CoNLL format.</p>
*
* <p>The fields currently output are:</p>
*
* <table>
* <tr>
* <td>Field Number</td>
* <td>Field Name</td>
* <td>Description</td>
* </tr>
* <tr>
* <td>1</td>
* <td>ID</td>
* <td>Token Counter, starting at 1 for each new sentence.</td>
* </tr>
* <tr>
* <td>2</td>
* <td>FORM</td>
* <td>Word form or punctuation symbol.</td>
* </tr>
* <tr>
* <td>3</td>
* <td>LEMMA</td>
* <td>Lemma of word form, or an underscore if not available.</td>
* </tr>
* <tr>
* <td>4</td>
* <td>POSTAG</td>
* <td>Fine-grained part-of-speech tag, or underscore if not available.</td>
* </tr>
* <tr>
* <td>5</td>
* <td>NER</td>
* <td>Named Entity tag, or underscore if not available.</td>
* </tr>
* <tr>
* <td>6</td>
* <td>HEAD</td>
* <td>Head of the current token, which is either a value of ID or zero ('0').
* This is underscore if not available.</td>
* </tr>
* <tr>
* <td>7</td>
* <td>DEPREL</td>
* <td>Dependency relation to the HEAD, or underscore if not available.</td>
* </tr>
* </table>
*
* @author Gabor Angeli
*/
public class CoNLLOutputter extends AnnotationOutputter {

  /** Written in place of any field whose value is not available. */
  private static final String NULL_PLACEHOLDER = "_";

  public CoNLLOutputter() { }

  /**
   * Substitutes the CoNLL placeholder for a missing value.
   *
   * @param in the field value, possibly null
   * @return {@code in} itself, or the underscore placeholder if {@code in} is null
   */
  private static String orNull(String in) {
    return in == null ? NULL_PLACEHOLDER : in;
  }

  /**
   * Formats one token as a single tab-separated CoNLL line (no trailing newline).
   *
   * @param index  1-based position of the token in its sentence
   * @param token  the token supplying word, lemma, POS tag and NER tag
   * @param head   index of the governing token, 0 for a root, or a negative
   *               value when no head is known
   * @param deprel dependency relation to the head; ignored when {@code head} is negative
   * @return the seven fields joined by tab characters
   */
  private static String line(int index,
                             CoreLabel token,
                             int head, String deprel) {
    List<String> fields = new ArrayList<>(7);

    fields.add(Integer.toString(index));  // 1: ID
    fields.add(orNull(token.word()));     // 2: FORM
    fields.add(orNull(token.lemma()));    // 3: LEMMA
    fields.add(orNull(token.tag()));      // 4: POSTAG
    fields.add(orNull(token.ner()));      // 5: NER
    if (head >= 0) {
      fields.add(Integer.toString(head)); // 6: HEAD
      fields.add(orNull(deprel));         // 7: DEPREL (never print a literal "null")
    } else {
      // No head known: both dependency columns get the placeholder.
      fields.add(NULL_PLACEHOLDER);
      fields.add(NULL_PLACEHOLDER);
    }

    return StringUtils.join(fields, "\t");
  }

  /**
   * Writes the document to {@code target} in CoNLL format: one line per token,
   * with a blank line between consecutive sentences and no newline after the
   * final token. Documents without sentences produce no output.
   *
   * <p>The stream is flushed but not closed; closing it is left to the caller.</p>
   *
   * @param doc     the annotated document to render
   * @param target  the stream that receives the output
   * @param options output options; not consulted by this implementation
   * @throws IOException declared by the overridden method
   */
  @Override
  public void print(Annotation doc, OutputStream target, Options options) throws IOException {
    // NOTE(review): PrintWriter(OutputStream) encodes with the platform default charset and
    // println() emits the platform line separator, so output bytes can differ across systems.
    // Confirm whether an explicit encoding / "\n" is wanted here.
    PrintWriter writer = new PrintWriter(target);

    List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
    if (sentences != null) {
      boolean firstSentence = true;
      for (CoreMap sentence : sentences) {
        if (!firstSentence) {
          // The previous sentence's last token has no line terminator yet, so two
          // newlines end that line and leave exactly one blank separator line.
          writer.println();
          writer.println();
        }
        firstSentence = false;

        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        if (tokens == null) {
          continue;
        }
        SemanticGraph depTree = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);

        // The set of root indices depends only on the sentence, so build it once
        // rather than once per token.
        Set<Integer> rootSet = depTree == null
            ? null
            : depTree.getRoots().stream().map(IndexedWord::index).collect(Collectors.toSet());

        for (int i = 0; i < tokens.size(); ++i) {
          // Newline between tokens, but not before the first one
          if (i > 0) {
            writer.println();
          }

          // Try to get the incoming dependency edge
          int head = -1;
          String deprel = null;
          if (depTree != null) {
            IndexedWord node = depTree.getNodeByIndexSafe(i + 1);
            if (node != null) {
              List<SemanticGraphEdge> edgeList = depTree.getIncomingEdgesSorted(node);
              if (!edgeList.isEmpty()) {
                // Basic dependencies are expected to give each node a single governor.
                assert edgeList.size() == 1;
                head = edgeList.get(0).getGovernor().index();
                deprel = edgeList.get(0).getRelation().toString();
              } else if (rootSet.contains(i + 1)) {
                head = 0;
                deprel = "ROOT";
              }
            }
          }

          // Write the token
          writer.print(line(i + 1, tokens.get(i), head, deprel));
        }
      }
    }
    writer.flush();
  }

  /**
   * Convenience wrapper: prints {@code annotation} in CoNLL format to {@code os}
   * using a fresh outputter.
   */
  public static void conllPrint(Annotation annotation, OutputStream os) throws IOException {
    new CoNLLOutputter().print(annotation, os);
  }

  /**
   * Convenience wrapper: prints {@code annotation} in CoNLL format to {@code os},
   * passing {@code pipeline} through to the outputter.
   */
  public static void conllPrint(Annotation annotation, OutputStream os, StanfordCoreNLP pipeline) throws IOException {
    new CoNLLOutputter().print(annotation, os, pipeline);
  }

  /**
   * Convenience wrapper: prints {@code annotation} in CoNLL format to {@code os}
   * with the supplied {@code options}.
   */
  public static void conllPrint(Annotation annotation, OutputStream os, Options options) throws IOException {
    new CoNLLOutputter().print(annotation, os, options);
  }

}
26 changes: 19 additions & 7 deletions src/edu/stanford/nlp/pipeline/JSONOutputter.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@


import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
Expand Down Expand Up @@ -106,13 +107,24 @@ public void print(Annotation doc, OutputStream target, Options options) throws I
@SuppressWarnings("RedundantCast") // It's lying; we need the "redundant" casts (as of 2014-09-08)
private static Object buildDependencyTree(SemanticGraph graph) {
if(graph != null) {
return graph.edgeListSorted().stream().map( (SemanticGraphEdge edge) -> (Consumer<Writer>) (Writer dep) -> {
dep.set("dep", edge.getRelation());
dep.set("governor", Integer.toString(edge.getGovernor().index()));
dep.set("governorGloss", edge.getGovernor().word());
dep.set("dependent", Integer.toString(edge.getDependent().index()));
dep.set("dependentGloss", edge.getDependent().word());
});
return Stream.concat(
// Roots
graph.getRoots().stream().map( (IndexedWord root) -> (Consumer<Writer>) dep -> {
dep.set("dep", "ROOT");
dep.set("governor", "0");
dep.set("governorGloss", "ROOT");
dep.set("dependent", Integer.toString(root.index()));
dep.set("dependentGloss", root.word());
}),
// Regular edges
graph.edgeListSorted().stream().map( (SemanticGraphEdge edge) -> (Consumer<Writer>) (Writer dep) -> {
dep.set("dep", edge.getRelation().toString());
dep.set("governor", Integer.toString(edge.getGovernor().index()));
dep.set("governorGloss", edge.getGovernor().word());
dep.set("dependent", Integer.toString(edge.getDependent().index()));
dep.set("dependentGloss", edge.getDependent().word());
})
);
} else {
return null;
}
Expand Down
21 changes: 20 additions & 1 deletion src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@

public class StanfordCoreNLP extends AnnotationPipeline {

enum OutputFormat { TEXT, XML, JSON, SERIALIZED }
enum OutputFormat { TEXT, XML, JSON, CONLL, SERIALIZED }

// other constants
public static final String CUSTOM_ANNOTATOR_PREFIX = "customAnnotatorClass.";
Expand Down Expand Up @@ -486,6 +486,19 @@ public void jsonPrint(Annotation annotation, Writer w) throws IOException {
w.flush();
}

/**
 * Renders an annotation in CoNLL format and sends the text to a Writer.
 * The output is first produced as bytes by {@link CoNLLOutputter}, then decoded
 * using the encoding returned by {@code getEncoding()}; the writer is flushed
 * before returning.
 *
 * @param annotation Contains the output of all annotators
 * @param w The Writer to send the output to
 * @throws IOException If rendering, decoding, or writing fails
 */
public void conllPrint(Annotation annotation, Writer w) throws IOException {
  ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  CoNLLOutputter.conllPrint(annotation, buffer, this);
  byte[] rendered = buffer.toByteArray();
  String text = new String(rendered, getEncoding());
  w.write(text);
  w.flush();
}

/**
* Displays the output of all annotators in XML format.
* @param annotation Contains the output of all annotators
Expand Down Expand Up @@ -869,6 +882,12 @@ public void processFiles(String base, final Collection<File> files, int numThrea
fos.close();
break;
}
case CONLL: {
OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
new CoNLLOutputter().print(annotation, fos);
fos.close();
break;
}
case TEXT: {
OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
prettyPrint(annotation, fos);
Expand Down
Loading

0 comments on commit 9b54ad9

Please sign in to comment.