setup for parsing wiki

mjfang27 · Stanford NLP · commit a5810611387b · 2017-04-02T14:14:34.000-07:00
diff --git a/src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java b/src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java
@@ -80,7 +80,7 @@
 
 public class StanfordCoreNLP extends AnnotationPipeline {
 
-  enum OutputFormat { TEXT, XML, JSON, CONLL, SERIALIZED }
+  enum OutputFormat { TEXT, XML, JSON, CONLL, TAGGED, SERIALIZED }
 
   // other constants
   public static final String CUSTOM_ANNOTATOR_PREFIX = "customAnnotatorClass.";
@@ -742,6 +742,7 @@ public void processFiles(String base, final Collection<File> files, int numThrea
       case CONLL: defaultExtension = ".conll"; break;
       case TEXT: defaultExtension = ".out"; break;
       case SERIALIZED: defaultExtension = ".ser.gz"; break;
+      case TAGGED: defaultExtension = ".tag"; break;
       default: throw new IllegalArgumentException("Unknown output format " + outputFormat);
     }
     final String serializerClass = properties.getProperty("serializer", GenericAnnotationSerializer.class.getName());
@@ -885,42 +886,48 @@ public void processFiles(String base, final Collection<File> files, int numThrea
           if (annotationOkay) {
             //--Output File
             switch (outputFormat) {
-            case XML: {
-              OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
-              xmlPrint(annotation, fos);
-              fos.close();
-              break;
-            }
-            case JSON: {
-              OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
-              new JSONOutputter().print(annotation, fos);
-              fos.close();
-              break;
-            }
-            case CONLL: {
-              OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
-              new CoNLLOutputter().print(annotation, fos);
-              fos.close();
-              break;
-            }
-            case TEXT: {
-              OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
-              prettyPrint(annotation, fos);
-              fos.close();
-              break;
-            }
-            case SERIALIZED: {
-              if (outputSerializerClass != null) {
-                AnnotationSerializer outputSerializer = loadSerializer(outputSerializerClass, outputSerializerName, properties);
+              case XML: {
                 OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
-                outputSerializer.write(annotation, fos).close();
-              } else {
-                IOUtils.writeObjectToFile(annotation, finalOutputFilename);
+                xmlPrint(annotation, fos);
+                fos.close();
+                break;
               }
-              break;
-            }
-            default:
-              throw new IllegalArgumentException("Unknown output format " + outputFormat);
+              case JSON: {
+                OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
+                new JSONOutputter().print(annotation, fos);
+                fos.close();
+                break;
+              }
+              case CONLL: {
+                OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
+                new CoNLLOutputter().print(annotation, fos);
+                fos.close();
+                break;
+              }
+              case TEXT: {
+                OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
+                prettyPrint(annotation, fos);
+                fos.close();
+                break;
+              }
+              case SERIALIZED: {
+                if (outputSerializerClass != null) {
+                  AnnotationSerializer outputSerializer = loadSerializer(outputSerializerClass, outputSerializerName, properties);
+                  OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
+                  outputSerializer.write(annotation, fos).close();
+                } else {
+                  IOUtils.writeObjectToFile(annotation, finalOutputFilename);
+                }
+                break;
+              }
+              case TAGGED: {
+                OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
+                TaggedTextOutputter.prettyPrint(annotation, fos, this);
+                fos.close();
+                break;
+              }
+              default:
+                throw new IllegalArgumentException("Unknown output format " + outputFormat);
             }
             synchronized (totalProcessed) {
               totalProcessed.incValue(1);
diff --git a/src/edu/stanford/nlp/pipeline/TaggedTextOutputter.java b/src/edu/stanford/nlp/pipeline/TaggedTextOutputter.java
@@ -0,0 +1,91 @@
+package edu.stanford.nlp.pipeline;
+
+import edu.stanford.nlp.io.IOUtils;
+import edu.stanford.nlp.io.RuntimeIOException;
+import edu.stanford.nlp.ling.CoreAnnotations;
+import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.util.CoreMap;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * Outputs a document back into text format: every token is written as its lower-cased lemma, and
+ * verbs and nouns are tagged as such ({@code _V} or {@code _N}).
+ *
+ * Created by michaelf on 7/15/15.
+ */
+public class TaggedTextOutputter extends AnnotationOutputter{
+  public TaggedTextOutputter() {}
+
+  /**
+   * Writes the tagged text of {@code doc} to {@code target}, encoded with {@code options.encoding}.
+   * The stream is flushed but not closed.
+   */
+  @Override
+  public void print(Annotation doc, OutputStream target, Options options) throws IOException {
+    PrintWriter os = new PrintWriter(IOUtils.encodedOutputStreamWriter(target, options.encoding));
+    print(doc, os, options);
+  }
+
+  /**
+   * Writes every sentence of {@code annotation} to {@code pw} and flushes it. Each token is preceded
+   * by a single space, so the output starts with a space and consecutive sentences stay separated.
+   * A token without a lemma falls back to its word, and a token without a part-of-speech tag gets
+   * no suffix. Nothing is written if the annotation has no sentences.
+   */
+  private static void print(Annotation annotation, PrintWriter pw, Options options) throws IOException {
+    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
+    if (sentences != null) {
+      for (CoreMap sentence : sentences) {
+        StringBuilder sentenceToWrite = new StringBuilder();
+        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
+          String text = token.lemma() != null ? token.lemma() : token.word();
+          String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
+          sentenceToWrite.append(' ');
+          if (text != null) {
+            // Locale.ROOT keeps the output independent of the JVM's default locale.
+            sentenceToWrite.append(text.toLowerCase(Locale.ROOT));
+          }
+          if (pos != null && pos.startsWith("V")) { // verb
+            sentenceToWrite.append("_V");
+          } else if (pos != null && pos.startsWith("N")) { // noun
+            sentenceToWrite.append("_N");
+          }
+        }
+        pw.print(sentenceToWrite.toString());
+      }
+    }
+    // The writers created in this class are never closed, so anything left in their buffers
+    // would be lost once the caller closes the underlying stream.
+    pw.flush();
+  }
+
+  //from TextOutputter
+
+  /**
+   * Static helper: writes the tagged text of {@code annotation} to {@code stream} without closing it.
+   * NOTE(review): this writer uses the platform default charset, unlike
+   * {@link #print(Annotation, OutputStream, Options)} which uses {@code options.encoding}.
+   */
+  public static void prettyPrint(Annotation annotation, OutputStream stream, StanfordCoreNLP pipeline) {
+    prettyPrint(annotation, new PrintWriter(stream), pipeline);
+  }
+
+  /**
+   * Static helper: writes the tagged text of {@code annotation} to {@code pw} using the options
+   * obtained from {@code pipeline}, rethrowing any {@link IOException} as a {@link RuntimeIOException}.
+   */
+  public static void prettyPrint(Annotation annotation, PrintWriter pw, StanfordCoreNLP pipeline) {
+    try {
+      TaggedTextOutputter.print(annotation, pw, getOptions(pipeline));
+      // print() has already flushed pw
+      // don't close, might not want to close underlying stream
+    } catch (IOException e) {
+      throw new RuntimeIOException(e);
+    }
+  }
+}