Add option to load SemanticGraphs from CoNLL-U files in SemgrexPattern.

sebschu · Stanford NLP · commit 70a7cf1c77f8 · 2015-05-22T16:26:11.000-07:00
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.java
@@ -5,7 +5,9 @@
 
 import edu.stanford.nlp.semgraph.SemanticGraph;
 import edu.stanford.nlp.semgraph.SemanticGraphFactory;
+import edu.stanford.nlp.io.IOUtils;
 import edu.stanford.nlp.ling.*;
+import edu.stanford.nlp.trees.CoNLLUDocumentReader;
 import edu.stanford.nlp.trees.GrammaticalStructure;
 import edu.stanford.nlp.trees.MemoryTreebank;
 import edu.stanford.nlp.trees.Tree;
@@ -31,7 +33,7 @@
  * matches "NN", "NNS", "NNP", etc. --wcmac) <p/>
  *
  * For example, <code>{lemma:slice;tag:/VB.* /}</code> represents any verb nodes
- * with "slice" as their lemma.  Attributes are extracted using 
+ * with "slice" as their lemma.  Attributes are extracted using
  * <code>edu.stanford.nlp.ling.AnnotationLookup</code>. <p/>
  *
  * The root of the graph can be marked by the $ sign, that is <code>{$}</code>
@@ -145,14 +147,14 @@
  * <p><h3>Naming relations</h3>
  *
  * It is also possible to name relations.  For example, you can write the pattern
- * <code>{idx:1} &gt;=reln {idx:2}</code>  The name of the relation will then 
- * be stored in the matcher and can be extracted with <code>getRelnName("reln")</code>  
- * At present, though, there is no backreferencing capability such as with the 
- * named nodes; this is only useful when using the API to extract the name of the 
+ * <code>{idx:1} &gt;=reln {idx:2}</code>.  The name of the relation will then
+ * be stored in the matcher and can be extracted with <code>getRelnName("reln")</code>.
+ * At present, though, there is no backreferencing capability such as with the
+ * named nodes; this is only useful when using the API to extract the name of the
  * relation used when making the match.
  * <p/>
  * In the case of ancestor and descendant relations, the <b>last</b>
- * relation in the sequence of relations is the name used.  
+ * relation in the sequence of relations is the name used.
  * <p/>
  *
  * @author Chloe Kiddon
@@ -353,11 +355,13 @@ public int hashCode() {
   static final String MODE = "-mode";
   static final String DEFAULT_MODE = "BASIC";
   static final String EXTRAS = "-extras";
-    
+  static final String CONLLU_FILE = "-conlluFile";
+
   public static void help() {
     System.err.println("Possible arguments for SemgrexPattern:");
     System.err.println(PATTERN + ": what pattern to use for matching");
     System.err.println(TREE_FILE + ": a file of trees to process");
+    System.err.println(CONLLU_FILE + ": a CoNLL-U file of dependency trees to process");
     System.err.println(MODE + ": what mode for dependencies.  basic, collapsed, or ccprocessed.  To get 'noncollapsed', use basic with extras");
     System.err.println(EXTRAS + ": whether or not to use extras");
     System.err.println();
@@ -372,7 +376,7 @@ public static void help() {
    * <br>
    * See the help() function for a list of possible arguments to provide.
    */
-  public static void main(String[] args) {
+  public static void main(String[] args) throws IOException {
     Map<String,Integer> flagMap = Generics.newHashMap();
 
     flagMap.put(PATTERN, 1);
@@ -400,7 +404,7 @@ public static void main(String[] args) {
     if (argsMap.containsKey(EXTRAS) && argsMap.get(EXTRAS).length > 0) {
       useExtras = Boolean.valueOf(argsMap.get(EXTRAS)[0]);
     }
-    
+
     List<SemanticGraph> graphs = Generics.newArrayList();
     // TODO: allow other sources of graphs, such as dependency files
     if (argsMap.containsKey(TREE_FILE) && argsMap.get(TREE_FILE).length > 0) {
@@ -416,6 +420,19 @@ public static void main(String[] args) {
       }
     }
 
+    if (argsMap.containsKey(CONLLU_FILE) && argsMap.get(CONLLU_FILE).length > 0) {
+      CoNLLUDocumentReader reader = new CoNLLUDocumentReader();
+      for (String conlluFile : argsMap.get(CONLLU_FILE)) {
+        System.err.println("Loading file " + conlluFile);
+        Iterator<SemanticGraph> it = reader.getIterator(IOUtils.readerFromString(conlluFile));
+
+        while (it.hasNext()) {
+          SemanticGraph graph = it.next();
+          graphs.add(graph);
+        }
+      }
+    }
+
     for (SemanticGraph graph : graphs) {
       SemgrexMatcher matcher = semgrex.matcher(graph);
       if (!(matcher.find())) {
diff --git a/src/edu/stanford/nlp/trees/CoNLLUDocumentReader.java b/src/edu/stanford/nlp/trees/CoNLLUDocumentReader.java
@@ -0,0 +1,167 @@
+package edu.stanford.nlp.trees;
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.function.Function;
+
+import edu.stanford.nlp.international.Language;
+import edu.stanford.nlp.ling.CoreAnnotations;
+import edu.stanford.nlp.ling.IndexedWord;
+import edu.stanford.nlp.objectbank.DelimitRegExIterator;
+import edu.stanford.nlp.objectbank.IteratorFromReaderFactory;
+import edu.stanford.nlp.objectbank.ObjectBank;
+import edu.stanford.nlp.semgraph.SemanticGraph;
+import edu.stanford.nlp.trees.GrammaticalRelation;
+import edu.stanford.nlp.trees.TypedDependency;
+
+/**
+ * Reader for CoNLL-U formatted dependency treebanks.
+ *
+ * @author Sebastian Schuster
+ */
+
+
+public class CoNLLUDocumentReader implements
+    IteratorFromReaderFactory<SemanticGraph> {
+
+
+  private IteratorFromReaderFactory<SemanticGraph> ifrf; // delegate used by getIterator(Reader)
+  /** Builds the delegate from a blank-line delimiter regex and a SentenceProcessor (presumably one chunk per sentence). */
+  public CoNLLUDocumentReader() {
+    this.ifrf = DelimitRegExIterator.getFactory("\n(\\s*\n)+", new SentenceProcessor());
+  }
+
+
+  @Override  // implements IteratorFromReaderFactory<SemanticGraph>
+  public Iterator<SemanticGraph> getIterator(Reader r) {
+    return ifrf.getIterator(r);  // pure delegation to the factory created in the constructor
+  }
+
+  private static class SentenceProcessor implements Function<String,SemanticGraph> {  // text chunk -> SemanticGraph
+    public SemanticGraph apply(String line) {
+      if (line == null) return null;
+      Function<String,IndexedWord> func = new WordProcessor();
+      ObjectBank<IndexedWord> words = ObjectBank.getLineIterator(new StringReader(line), func);
+      List<IndexedWord> sorted = new ArrayList<IndexedWord>(words);
+      Collections.sort(sorted);  // natural ordering of IndexedWord (presumably by word index -- confirm)
+      // NOTE(review): sorted.get(govIdx - 1) below assumes the word with index i ends up at list
+      // position i - 1 after sorting (indices 1..n without gaps) -- confirm this holds for all inputs.
+      /* Construct a semantic graph. */
+      List<TypedDependency> deps = new ArrayList<TypedDependency>(sorted.size());
+      for (IndexedWord word : sorted) {
+        GrammaticalRelation reln = GrammaticalRelation.valueOf(Language.UniversalEnglish, word.get(CoreAnnotations.CoNLLDepTypeAnnotation.class));
+        int govIdx = word.get(CoreAnnotations.CoNLLDepParentIndexAnnotation.class);
+        IndexedWord gov;
+        if (govIdx == 0) {  // head index 0: the governor is a freshly created node with value "ROOT"
+          gov = new IndexedWord(word.docID(), word.sentIndex(), 0);
+          gov.setValue("ROOT");
+          if (word.get(CoreAnnotations.CoNLLDepTypeAnnotation.class).equals("root")) {
+            reln = GrammaticalRelation.ROOT;  // only the literal type "root" is mapped to the ROOT relation
+          }
+        } else {
+          gov = sorted.get(govIdx - 1);  // head indices are 1-based, list positions 0-based
+        }
+        TypedDependency dep = new TypedDependency(reln, gov, word);
+        deps.add(dep);
+      }
+      // Each word contributed exactly one dependency: the edge from its governor to itself.
+      return new SemanticGraph(deps);
+    }
+  }
+
+  private static class WordProcessor implements Function<String,IndexedWord> {  // one token line -> IndexedWord
+    public IndexedWord apply(String line) {
+      String[] bits = line.split("\\s+");  // columns split on any whitespace; bits[0]..bits[9] are read below
+      IndexedWord word = new IndexedWord();
+      word.set(CoreAnnotations.IndexAnnotation.class, Integer.parseInt(bits[0]));
+      word.set(CoreAnnotations.TextAnnotation.class, bits[1]);
+      word.set(CoreAnnotations.LemmaAnnotation.class, bits[2]);
+      word.set(CoreAnnotations.CoarseTagAnnotation.class, bits[3]);
+      word.set(CoreAnnotations.PartOfSpeechAnnotation.class, bits[4]);
+      // bits[5] (the feature column) is handled further down via parseFeatures.
+      word.set(CoreAnnotations.CoNLLDepParentIndexAnnotation.class, Integer.parseInt(bits[6]));
+      word.set(CoreAnnotations.CoNLLDepTypeAnnotation.class, bits[7]);
+      word.set(CoreAnnotations.CoNLLUSecondaryDepsAnnotation.class, bits[8]);
+      word.set(CoreAnnotations.CoNLLUMisc.class, bits[9]);
+      // NOTE(review): a non-integer bits[0] or bits[6] makes Integer.parseInt throw NumberFormatException.
+      word.setIndex(Integer.parseInt(bits[0]));
+      word.setValue(bits[1]);
+
+      /* Parse features. */
+      HashMap<String, String> features = parseFeatures(bits[5]);
+
+      word.set(CoreAnnotations.CoNLLUFeats.class, features);
+      // NOTE(review): a line with fewer than 10 columns ends in ArrayIndexOutOfBoundsException above;
+      // there is no explicit column-count check, and a field containing whitespace would shift the columns.
+      return word;
+    }
+  }
+
+
+  /**
+   * Parses the value of the feature column in a CoNLL-U file and returns the
+   * features in a HashMap with the feature names as keys and the feature values
+   * as values.  Each pair is split at its first '='; a pair without '=' gets "".
+   *
+   * @param featureString The raw column value, e.g. "Case=Nom|Number=Sing"; "_" means no features.
+   * @return A {@code HashMap<String,String>} with the feature values.
+   */
+  public static HashMap<String,String> parseFeatures(String featureString) {
+    HashMap<String, String> features = new HashMap<String, String>();
+    if (! featureString.equals("_")) {
+      for (String p : featureString.split("\\|")) {
+        // Limit 2 keeps '=' inside a value and keeps a trailing empty value ("Name=").
+        String[] featValPair = p.split("=", 2);
+        features.put(featValPair[0], featValPair.length > 1 ? featValPair[1] : "");
+      }
+    }
+    return features;
+  }
+
+  /**
+   * Converts a feature HashMap to a feature string to be used
+   * in a CoNLL-U file.  Features are written as name=value pairs
+   * joined by "|", ordered by FeatureNameComparator; an empty map
+   * is written as "_".
+   *
+   * @param features The feature names mapped to their values.
+   * @return The feature string.
+   */
+  public static String toFeatureString(HashMap<String,String> features) {
+    /* Empty feature list. */
+    if (features.isEmpty()) {
+      return "_";
+    }
+
+    List<String> sortedKeys = new ArrayList<String>(features.keySet());
+    Collections.sort(sortedKeys, new FeatureNameComparator());
+
+    // StringBuilder: the buffer never leaves this method, so the
+    // synchronization of StringBuffer is not needed.
+    StringBuilder sb = new StringBuilder();
+    for (String key : sortedKeys) {
+      if (sb.length() > 0) {
+        sb.append("|");
+      }
+      sb.append(key)
+        .append("=")
+        .append(features.get(key));
+    }
+
+    return sb.toString();
+  }
+
+  public static class FeatureNameComparator implements Comparator<String> {
+    /** Orders feature names by their lower-cased form; Locale.ROOT keeps the order independent of the JVM default locale. */
+    @Override
+    public int compare(String featureName1, String featureName2) {
+      return featureName1.toLowerCase(java.util.Locale.ROOT).compareTo(featureName2.toLowerCase(java.util.Locale.ROOT));
+    }
+  }
+}