forked from stanfordnlp/CoreNLP
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add CoNLL outputter; debug JSON outputter
- Loading branch information
Showing
7 changed files
with
403 additions
and
34 deletions.
There are no files selected for viewing
120 changes: 120 additions & 0 deletions
120
itest/src/edu/stanford/nlp/pipeline/AnnotationOutputterITest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
package edu.stanford.nlp.pipeline; | ||
|
||
import junit.framework.TestCase; | ||
|
||
import java.io.IOException; | ||
import java.util.Properties; | ||
|
||
/** | ||
* Tests for the various annotation outputters which require the models to be loaded. | ||
* | ||
* @author Gabor Angeli | ||
*/ | ||
public class AnnotationOutputterITest extends TestCase { | ||
|
||
static StanfordCoreNLP pipeline = | ||
new StanfordCoreNLP(new Properties() {{ setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse"); }}); | ||
|
||
public void testSimpleSentenceCoNLL() throws IOException { | ||
Annotation ann = new Annotation("The cat is fat. The dog is lazy."); | ||
pipeline.annotate(ann); | ||
String actual = new CoNLLOutputter().print(ann); | ||
String expected = | ||
"1\tThe\tthe\tDT\tO\t2\tdet\n" + | ||
"2\tcat\tcat\tNN\tO\t4\tnsubj\n" + | ||
"3\tis\tbe\tVBZ\tO\t4\tcop\n" + | ||
"4\tfat\tfat\tJJ\tO\t0\tROOT\n" + | ||
"5\t.\t.\t.\tO\t_\t_\n" + | ||
"\n" + | ||
"1\tThe\tthe\tDT\tO\t2\tdet\n" + | ||
"2\tdog\tdog\tNN\tO\t4\tnsubj\n" + | ||
"3\tis\tbe\tVBZ\tO\t4\tcop\n" + | ||
"4\tlazy\tlazy\tJJ\tO\t0\tROOT\n" + | ||
"5\t.\t.\t.\tO\t_\t_"; | ||
assertEquals(expected, actual); | ||
} | ||
|
||
public void testSimpleSentenceJSON() throws IOException { | ||
Annotation ann = new Annotation("Bad wolf"); | ||
pipeline.annotate(ann); | ||
String actual = new JSONOutputter().print(ann); | ||
String expected = | ||
"{\n" + | ||
" \"sentences\": [\n" + | ||
" {\n" + | ||
" \"index\": \"0\",\n" + | ||
" \"parse\": \"(ROOT\\n (NP (JJ Bad) (NN wolf)))\\n\\n\",\n" + | ||
" \"basic-dependencies\": [\n" + | ||
" {\n" + | ||
" \"dep\": \"ROOT\",\n" + | ||
" \"governor\": \"0\",\n" + | ||
" \"governorGloss\": \"ROOT\",\n" + | ||
" \"dependent\": \"2\",\n" + | ||
" \"dependentGloss\": \"wolf\"\n" + | ||
" },\n" + | ||
" {\n" + | ||
" \"dep\": \"amod\",\n" + | ||
" \"governor\": \"2\",\n" + | ||
" \"governorGloss\": \"wolf\",\n" + | ||
" \"dependent\": \"1\",\n" + | ||
" \"dependentGloss\": \"Bad\"\n" + | ||
" }\n" + | ||
" ],\n" + | ||
" \"collapsed-dependencies\": [\n" + | ||
" {\n" + | ||
" \"dep\": \"ROOT\",\n" + | ||
" \"governor\": \"0\",\n" + | ||
" \"governorGloss\": \"ROOT\",\n" + | ||
" \"dependent\": \"2\",\n" + | ||
" \"dependentGloss\": \"wolf\"\n" + | ||
" },\n" + | ||
" {\n" + | ||
" \"dep\": \"amod\",\n" + | ||
" \"governor\": \"2\",\n" + | ||
" \"governorGloss\": \"wolf\",\n" + | ||
" \"dependent\": \"1\",\n" + | ||
" \"dependentGloss\": \"Bad\"\n" + | ||
" }\n" + | ||
" ],\n" + | ||
" \"collapsed-ccprocessed-dependencies\": [\n" + | ||
" {\n" + | ||
" \"dep\": \"ROOT\",\n" + | ||
" \"governor\": \"0\",\n" + | ||
" \"governorGloss\": \"ROOT\",\n" + | ||
" \"dependent\": \"2\",\n" + | ||
" \"dependentGloss\": \"wolf\"\n" + | ||
" },\n" + | ||
" {\n" + | ||
" \"dep\": \"amod\",\n" + | ||
" \"governor\": \"2\",\n" + | ||
" \"governorGloss\": \"wolf\",\n" + | ||
" \"dependent\": \"1\",\n" + | ||
" \"dependentGloss\": \"Bad\"\n" + | ||
" }\n" + | ||
" ],\n" + | ||
" \"tokens\": [\n" + | ||
" {\n" + | ||
" \"index\": \"1\",\n" + | ||
" \"word\": \"Bad\",\n" + | ||
" \"lemma\": \"bad\",\n" + | ||
" \"characterOffsetBegin\": \"0\",\n" + | ||
" \"characterOffsetEnd\": \"3\",\n" + | ||
" \"pos\": \"JJ\",\n" + | ||
" \"ner\": \"O\"\n" + | ||
" },\n" + | ||
" {\n" + | ||
" \"index\": \"2\",\n" + | ||
" \"word\": \"wolf\",\n" + | ||
" \"lemma\": \"wolf\",\n" + | ||
" \"characterOffsetBegin\": \"4\",\n" + | ||
" \"characterOffsetEnd\": \"8\",\n" + | ||
" \"pos\": \"NN\",\n" + | ||
" \"ner\": \"O\"\n" + | ||
" }\n" + | ||
" ]\n" + | ||
" }\n" + | ||
" ]\n" + | ||
"}"; | ||
assertEquals(expected, actual); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,173 @@ | ||
package edu.stanford.nlp.pipeline; | ||
|
||
import edu.stanford.nlp.ling.CoreAnnotations; | ||
import edu.stanford.nlp.ling.CoreLabel; | ||
import edu.stanford.nlp.ling.IndexedWord; | ||
import edu.stanford.nlp.semgraph.SemanticGraph; | ||
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations; | ||
import edu.stanford.nlp.semgraph.SemanticGraphEdge; | ||
import edu.stanford.nlp.util.CoreMap; | ||
import edu.stanford.nlp.util.StringUtils; | ||
|
||
import java.io.IOException; | ||
import java.io.OutputStream; | ||
import java.io.PrintWriter; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.Set; | ||
import java.util.stream.Collectors; | ||
|
||
/** | ||
* <p>Write a subset of our CoreNLP output in CoNLL format.</p> | ||
* | ||
* <p>The fields currently output are:</p> | ||
* | ||
* <table> | ||
* <tr> | ||
* <td>Field Number</td> | ||
* <td>Field Name</td> | ||
* <td>Description</td> | ||
* </tr> | ||
* <tr> | ||
* <td>1</td> | ||
* <td>ID</td> | ||
* <td>Token Counter, starting at 1 for each new sentence.</td> | ||
* </tr> | ||
* <tr> | ||
* <td>2</td> | ||
* <td>FORM</td> | ||
* <td>Word form or punctuation symbol.</td> | ||
* </tr> | ||
* <tr> | ||
* <td>3</td> | ||
* <td>LEMMA</td> | ||
* <td>Lemma of word form, or an underscore if not available.</td> | ||
* </tr> | ||
* <tr> | ||
* <td>4</td> | ||
* <td>POSTAG</td> | ||
* <td>Fine-grained part-of-speech tag, or underscore if not available.</td> | ||
* </tr> | ||
* <tr> | ||
* <td>5</td> | ||
* <td>NER</td> | ||
* <td>Named Entity tag, or underscore if not available.</td> | ||
* </tr> | ||
* <tr> | ||
* <td>6</td> | ||
* <td>HEAD</td> | ||
* <td>Head of the current token, which is either a value of ID or zero ('0'). | ||
* This is underscore if not available.</td> | ||
* </tr> | ||
* <tr> | ||
* <td>7</td> | ||
* <td>DEPREL</td> | ||
* <td>Dependency relation to the HEAD, or underscore if not available.</td> | ||
* </tr> | ||
* </table> | ||
* | ||
* @author Gabor Angeli | ||
*/ | ||
public class CoNLLOutputter extends AnnotationOutputter { | ||
|
||
private static final String NULL_PLACEHOLDER = "_"; | ||
|
||
public CoNLLOutputter() { } | ||
|
||
private String orNull(String in) { | ||
if (in == null) { | ||
return NULL_PLACEHOLDER; | ||
} else { | ||
return in; | ||
} | ||
} | ||
|
||
/** | ||
* Write a line of the CoNLL output. | ||
*/ | ||
private String line(int index, | ||
CoreLabel token, | ||
int head, String deprel) { | ||
ArrayList<String> fields = new ArrayList<>(16); | ||
|
||
fields.add(Integer.toString(index)); // 1 | ||
fields.add(orNull(token.word())); // 2 | ||
fields.add(orNull(token.lemma())); // 3 | ||
fields.add(orNull(token.tag())); // 4 | ||
fields.add(orNull(token.ner())); // 5 | ||
if (head >= 0) { | ||
fields.add(Integer.toString(head)); // 6 | ||
fields.add(deprel); // 7 | ||
} else { | ||
fields.add(NULL_PLACEHOLDER); | ||
fields.add(NULL_PLACEHOLDER); | ||
} | ||
|
||
return StringUtils.join(fields, "\t"); | ||
} | ||
|
||
@Override | ||
public void print(Annotation doc, OutputStream target, Options options) throws IOException { | ||
PrintWriter writer = new PrintWriter(target); | ||
|
||
// vv A bunch of nonsense to get tokens vv | ||
boolean firstSentence = true; | ||
if (doc.get(CoreAnnotations.SentencesAnnotation.class) != null) { | ||
for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) { | ||
if (!firstSentence) { | ||
writer.println(); | ||
writer.println(); | ||
} | ||
firstSentence = false; | ||
if (sentence.get(CoreAnnotations.TokensAnnotation.class) != null) { | ||
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); | ||
SemanticGraph depTree = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class); | ||
for (int i = 0; i < tokens.size(); ++i) { | ||
// ^^ end nonsense to get tokens ^^ | ||
|
||
// Newline if applicable | ||
if (i > 0) { | ||
writer.println(); | ||
} | ||
|
||
// Try to get the incoming dependency edge | ||
int head = -1; | ||
String deprel = null; | ||
if (depTree != null) { | ||
Set<Integer> rootSet = depTree.getRoots().stream().map(IndexedWord::index).collect(Collectors.toSet()); | ||
IndexedWord node = depTree.getNodeByIndexSafe(i + 1); | ||
if (node != null) { | ||
List<SemanticGraphEdge> edgeList = depTree.getIncomingEdgesSorted(node); | ||
if (!edgeList.isEmpty()) { | ||
assert edgeList.size() == 1; | ||
head = edgeList.get(0).getGovernor().index(); | ||
deprel = edgeList.get(0).getRelation().toString(); | ||
} else if (rootSet.contains(i + 1)) { | ||
head = 0; | ||
deprel = "ROOT"; | ||
} | ||
} | ||
} | ||
|
||
// Write the token | ||
writer.print(line(i + 1, tokens.get(i), head, deprel)); | ||
} | ||
} | ||
} | ||
} | ||
writer.flush(); | ||
} | ||
|
||
public static void conllPrint(Annotation annotation, OutputStream os) throws IOException { | ||
new CoNLLOutputter().print(annotation, os); | ||
} | ||
|
||
public static void conllPrint(Annotation annotation, OutputStream os, StanfordCoreNLP pipeline) throws IOException { | ||
new CoNLLOutputter().print(annotation, os, pipeline); | ||
} | ||
|
||
public static void conllPrint(Annotation annotation, OutputStream os, Options options) throws IOException { | ||
new CoNLLOutputter().print(annotation, os, options); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.