Skip to content

Commit a581061

Browse files
mjfang27Stanford NLP
authored and
Stanford NLP
committed
setup for parsing wiki
1 parent 460091d commit a581061

File tree

2 files changed

+107
-35
lines changed

2 files changed

+107
-35
lines changed

src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java

+42-35
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@
8080

8181
public class StanfordCoreNLP extends AnnotationPipeline {
8282

83-
enum OutputFormat { TEXT, XML, JSON, CONLL, SERIALIZED }
83+
enum OutputFormat { TEXT, XML, JSON, CONLL, TAGGED, SERIALIZED }
8484

8585
// other constants
8686
public static final String CUSTOM_ANNOTATOR_PREFIX = "customAnnotatorClass.";
@@ -742,6 +742,7 @@ public void processFiles(String base, final Collection<File> files, int numThrea
742742
case CONLL: defaultExtension = ".conll"; break;
743743
case TEXT: defaultExtension = ".out"; break;
744744
case SERIALIZED: defaultExtension = ".ser.gz"; break;
745+
case TAGGED: defaultExtension = ".tag"; break;
745746
default: throw new IllegalArgumentException("Unknown output format " + outputFormat);
746747
}
747748
final String serializerClass = properties.getProperty("serializer", GenericAnnotationSerializer.class.getName());
@@ -885,42 +886,48 @@ public void processFiles(String base, final Collection<File> files, int numThrea
885886
if (annotationOkay) {
886887
//--Output File
887888
switch (outputFormat) {
888-
case XML: {
889-
OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
890-
xmlPrint(annotation, fos);
891-
fos.close();
892-
break;
893-
}
894-
case JSON: {
895-
OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
896-
new JSONOutputter().print(annotation, fos);
897-
fos.close();
898-
break;
899-
}
900-
case CONLL: {
901-
OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
902-
new CoNLLOutputter().print(annotation, fos);
903-
fos.close();
904-
break;
905-
}
906-
case TEXT: {
907-
OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
908-
prettyPrint(annotation, fos);
909-
fos.close();
910-
break;
911-
}
912-
case SERIALIZED: {
913-
if (outputSerializerClass != null) {
914-
AnnotationSerializer outputSerializer = loadSerializer(outputSerializerClass, outputSerializerName, properties);
889+
case XML: {
915890
OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
916-
outputSerializer.write(annotation, fos).close();
917-
} else {
918-
IOUtils.writeObjectToFile(annotation, finalOutputFilename);
891+
xmlPrint(annotation, fos);
892+
fos.close();
893+
break;
919894
}
920-
break;
921-
}
922-
default:
923-
throw new IllegalArgumentException("Unknown output format " + outputFormat);
895+
case JSON: {
896+
OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
897+
new JSONOutputter().print(annotation, fos);
898+
fos.close();
899+
break;
900+
}
901+
case CONLL: {
902+
OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
903+
new CoNLLOutputter().print(annotation, fos);
904+
fos.close();
905+
break;
906+
}
907+
case TEXT: {
908+
OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
909+
prettyPrint(annotation, fos);
910+
fos.close();
911+
break;
912+
}
913+
case SERIALIZED: {
914+
if (outputSerializerClass != null) {
915+
AnnotationSerializer outputSerializer = loadSerializer(outputSerializerClass, outputSerializerName, properties);
916+
OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
917+
outputSerializer.write(annotation, fos).close();
918+
} else {
919+
IOUtils.writeObjectToFile(annotation, finalOutputFilename);
920+
}
921+
break;
922+
}
923+
case TAGGED: {
924+
OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
925+
TaggedTextOutputter.prettyPrint(annotation, fos, this);
926+
fos.close();
927+
break;
928+
}
929+
default:
930+
throw new IllegalArgumentException("Unknown output format " + outputFormat);
924931
}
925932
synchronized (totalProcessed) {
926933
totalProcessed.incValue(1);
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
package edu.stanford.nlp.pipeline;
2+
3+
import edu.stanford.nlp.io.IOUtils;
4+
import edu.stanford.nlp.io.RuntimeIOException;
5+
import edu.stanford.nlp.ling.CoreAnnotations;
6+
import edu.stanford.nlp.ling.CoreLabel;
7+
import edu.stanford.nlp.util.CoreMap;
8+
9+
import java.io.IOException;
10+
import java.io.OutputStream;
11+
import java.io.PrintWriter;
12+
import java.util.List;
13+
14+
/**
15+
* Created by michaelf on 7/15/15. Outputs document back into text format, with verbs and nouns tagged as such (_V or _N) and also lemmatized.
16+
*/
17+
public class TaggedTextOutputter extends AnnotationOutputter{
18+
public TaggedTextOutputter() {}
19+
20+
@Override
21+
public void print(Annotation doc, OutputStream target, Options options) throws IOException {
22+
PrintWriter os = new PrintWriter(IOUtils.encodedOutputStreamWriter(target, options.encoding));
23+
print(doc, os, options);
24+
}
25+
26+
27+
private static void print(Annotation annotation, PrintWriter pw, Options options) throws IOException {
28+
List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
29+
if(sentences != null)
30+
{
31+
for(int i = 0; i < sentences.size(); i++) {
32+
CoreMap sentence = sentences.get(i);
33+
StringBuilder sentenceToWrite = new StringBuilder();
34+
for(CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class))
35+
{
36+
sentenceToWrite.append(" ");
37+
sentenceToWrite.append(token.lemma().toLowerCase());
38+
if(token.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("V")) //verb
39+
sentenceToWrite.append("_V");
40+
else if(token.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("N")) //noun
41+
sentenceToWrite.append("_N");
42+
}
43+
pw.print(sentenceToWrite.toString()); //omit first space
44+
}
45+
}
46+
}
47+
48+
//from TextOutputter
49+
50+
/** Static helper */
51+
public static void prettyPrint(Annotation annotation, OutputStream stream, StanfordCoreNLP pipeline) {
52+
prettyPrint(annotation, new PrintWriter(stream), pipeline);
53+
}
54+
55+
/** Static helper */
56+
public static void prettyPrint(Annotation annotation, PrintWriter pw, StanfordCoreNLP pipeline) {
57+
try {
58+
TaggedTextOutputter.print(annotation, pw, getOptions(pipeline));
59+
// already flushed
60+
// don't close, might not want to close underlying stream
61+
} catch (IOException e) {
62+
throw new RuntimeIOException(e);
63+
}
64+
}
65+
}

0 commit comments

Comments
 (0)