Skip to content

Commit 70a7cf1

Browse files
sebschuStanford NLP
authored and
Stanford NLP
committed
Add option to load SemanticGraphs from CoNLL-U files in SemgrexPattern.
1 parent 0ce3922 commit 70a7cf1

File tree

2 files changed

+193
-9
lines changed

2 files changed

+193
-9
lines changed

src/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.java

+26-9
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55

66
import edu.stanford.nlp.semgraph.SemanticGraph;
77
import edu.stanford.nlp.semgraph.SemanticGraphFactory;
8+
import edu.stanford.nlp.io.IOUtils;
89
import edu.stanford.nlp.ling.*;
10+
import edu.stanford.nlp.trees.CoNLLUDocumentReader;
911
import edu.stanford.nlp.trees.GrammaticalStructure;
1012
import edu.stanford.nlp.trees.MemoryTreebank;
1113
import edu.stanford.nlp.trees.Tree;
@@ -31,7 +33,7 @@
3133
* matches "NN", "NNS", "NNP", etc. --wcmac) <p/>
3234
*
3335
* For example, <code>{lemma:slice;tag:/VB.* /}</code> represents any verb nodes
34-
* with "slice" as their lemma. Attributes are extracted using
36+
* with "slice" as their lemma. Attributes are extracted using
3537
* <code>edu.stanford.nlp.ling.AnnotationLookup</code>. <p/>
3638
*
3739
* The root of the graph can be marked by the $ sign, that is <code>{$}</code>
@@ -145,14 +147,14 @@
145147
* <p><h3>Naming relations</h3>
146148
*
147149
* It is also possible to name relations. For example, you can write the pattern
148-
* <code>{idx:1} &gt;=reln {idx:2}</code> The name of the relation will then
149-
* be stored in the matcher and can be extracted with <code>getRelnName("reln")</code>
150-
* At present, though, there is no backreferencing capability such as with the
151-
* named nodes; this is only useful when using the API to extract the name of the
150+
* <code>{idx:1} &gt;=reln {idx:2}</code> The name of the relation will then
151+
* be stored in the matcher and can be extracted with <code>getRelnName("reln")</code>
152+
* At present, though, there is no backreferencing capability such as with the
153+
* named nodes; this is only useful when using the API to extract the name of the
152154
* relation used when making the match.
153155
* <p/>
154156
* In the case of ancestor and descendant relations, the <b>last</b>
155-
* relation in the sequence of relations is the name used.
157+
* relation in the sequence of relations is the name used.
156158
* <p/>
157159
*
158160
* @author Chloe Kiddon
@@ -353,11 +355,13 @@ public int hashCode() {
353355
static final String MODE = "-mode";
354356
static final String DEFAULT_MODE = "BASIC";
355357
static final String EXTRAS = "-extras";
356-
358+
static final String CONLLU_FILE = "-conlluFile";
359+
357360
public static void help() {
358361
System.err.println("Possible arguments for SemgrexPattern:");
359362
System.err.println(PATTERN + ": what pattern to use for matching");
360363
System.err.println(TREE_FILE + ": a file of trees to process");
364+
System.err.println(CONLLU_FILE + ": a CoNLL-U file of dependency trees to process");
361365
System.err.println(MODE + ": what mode for dependencies. basic, collapsed, or ccprocessed. To get 'noncollapsed', use basic with extras");
362366
System.err.println(EXTRAS + ": whether or not to use extras");
363367
System.err.println();
@@ -372,7 +376,7 @@ public static void help() {
372376
* <br>
373377
* See the help() function for a list of possible arguments to provide.
374378
*/
375-
public static void main(String[] args) {
379+
public static void main(String[] args) throws IOException {
376380
Map<String,Integer> flagMap = Generics.newHashMap();
377381

378382
flagMap.put(PATTERN, 1);
@@ -400,7 +404,7 @@ public static void main(String[] args) {
400404
if (argsMap.containsKey(EXTRAS) && argsMap.get(EXTRAS).length > 0) {
401405
useExtras = Boolean.valueOf(argsMap.get(EXTRAS)[0]);
402406
}
403-
407+
404408
List<SemanticGraph> graphs = Generics.newArrayList();
405409
// TODO: allow other sources of graphs, such as dependency files
406410
if (argsMap.containsKey(TREE_FILE) && argsMap.get(TREE_FILE).length > 0) {
@@ -416,6 +420,19 @@ public static void main(String[] args) {
416420
}
417421
}
418422

423+
if (argsMap.containsKey(CONLLU_FILE) && argsMap.get(CONLLU_FILE).length > 0) {
424+
CoNLLUDocumentReader reader = new CoNLLUDocumentReader();
425+
for (String conlluFile : argsMap.get(CONLLU_FILE)) {
426+
System.err.println("Loading file " + conlluFile);
427+
Iterator<SemanticGraph> it = reader.getIterator(IOUtils.readerFromString(conlluFile));
428+
429+
while (it.hasNext()) {
430+
SemanticGraph graph = it.next();
431+
graphs.add(graph);
432+
}
433+
}
434+
}
435+
419436
for (SemanticGraph graph : graphs) {
420437
SemgrexMatcher matcher = semgrex.matcher(graph);
421438
if (!(matcher.find())) {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
package edu.stanford.nlp.trees;
2+
3+
import java.io.Reader;
4+
import java.io.StringReader;
5+
import java.util.ArrayList;
6+
import java.util.Collections;
7+
import java.util.Comparator;
8+
import java.util.HashMap;
9+
import java.util.Iterator;
10+
import java.util.List;
11+
import java.util.function.Function;
12+
13+
import edu.stanford.nlp.international.Language;
14+
import edu.stanford.nlp.ling.CoreAnnotations;
15+
import edu.stanford.nlp.ling.IndexedWord;
16+
import edu.stanford.nlp.objectbank.DelimitRegExIterator;
17+
import edu.stanford.nlp.objectbank.IteratorFromReaderFactory;
18+
import edu.stanford.nlp.objectbank.ObjectBank;
19+
import edu.stanford.nlp.semgraph.SemanticGraph;
20+
import edu.stanford.nlp.trees.GrammaticalRelation;
21+
import edu.stanford.nlp.trees.TypedDependency;
22+
23+
/**
24+
* Reader for CoNLL-U formatted dependency treebanks.
25+
*
26+
* @author Sebastian Schuster
27+
*/
28+
29+
30+
public class CoNLLUDocumentReader implements
31+
IteratorFromReaderFactory<SemanticGraph> {
32+
33+
34+
private IteratorFromReaderFactory<SemanticGraph> ifrf;
35+
36+
public CoNLLUDocumentReader() {
37+
this.ifrf = DelimitRegExIterator.getFactory("\n(\\s*\n)+", new SentenceProcessor());
38+
}
39+
40+
41+
@Override
42+
public Iterator<SemanticGraph> getIterator(Reader r) {
43+
return ifrf.getIterator(r);
44+
}
45+
46+
private static class SentenceProcessor implements Function<String,SemanticGraph> {
47+
public SemanticGraph apply(String line) {
48+
if (line == null) return null;
49+
Function<String,IndexedWord> func = new WordProcessor();
50+
ObjectBank<IndexedWord> words = ObjectBank.getLineIterator(new StringReader(line), func);
51+
List<IndexedWord> sorted = new ArrayList<IndexedWord>(words);
52+
Collections.sort(sorted);
53+
54+
55+
/* Construct a semantic graph. */
56+
List<TypedDependency> deps = new ArrayList<TypedDependency>(sorted.size());
57+
for (IndexedWord word : sorted) {
58+
GrammaticalRelation reln = GrammaticalRelation.valueOf(Language.UniversalEnglish, word.get(CoreAnnotations.CoNLLDepTypeAnnotation.class));
59+
int govIdx = word.get(CoreAnnotations.CoNLLDepParentIndexAnnotation.class);
60+
IndexedWord gov;
61+
if (govIdx == 0) {
62+
gov = new IndexedWord(word.docID(), word.sentIndex(), 0);
63+
gov.setValue("ROOT");
64+
if (word.get(CoreAnnotations.CoNLLDepTypeAnnotation.class).equals("root")) {
65+
reln = GrammaticalRelation.ROOT;
66+
}
67+
} else {
68+
gov = sorted.get(govIdx - 1);
69+
}
70+
TypedDependency dep = new TypedDependency(reln, gov, word);
71+
deps.add(dep);
72+
}
73+
74+
return new SemanticGraph(deps);
75+
}
76+
}
77+
78+
private static class WordProcessor implements Function<String,IndexedWord> {
79+
public IndexedWord apply(String line) {
80+
String[] bits = line.split("\\s+");
81+
IndexedWord word = new IndexedWord();
82+
word.set(CoreAnnotations.IndexAnnotation.class, Integer.parseInt(bits[0]));
83+
word.set(CoreAnnotations.TextAnnotation.class, bits[1]);
84+
word.set(CoreAnnotations.LemmaAnnotation.class, bits[2]);
85+
word.set(CoreAnnotations.CoarseTagAnnotation.class, bits[3]);
86+
word.set(CoreAnnotations.PartOfSpeechAnnotation.class, bits[4]);
87+
88+
word.set(CoreAnnotations.CoNLLDepParentIndexAnnotation.class, Integer.parseInt(bits[6]));
89+
word.set(CoreAnnotations.CoNLLDepTypeAnnotation.class, bits[7]);
90+
word.set(CoreAnnotations.CoNLLUSecondaryDepsAnnotation.class, bits[8]);
91+
word.set(CoreAnnotations.CoNLLUMisc.class, bits[9]);
92+
93+
word.setIndex(Integer.parseInt(bits[0]));
94+
word.setValue(bits[1]);
95+
96+
/* Parse features. */
97+
HashMap<String, String> features = parseFeatures(bits[5]);
98+
99+
word.set(CoreAnnotations.CoNLLUFeats.class, features);
100+
101+
102+
return word;
103+
}
104+
}
105+
106+
107+
/**
108+
* Parses the value of the feature column in a CoNLL-U file
109+
* and returns them in a HashMap with the feature names as keys
110+
* and the feature values as values.
111+
*
112+
* @param featureString
113+
* @return A HashMap<String,String> with the feature values.
114+
*/
115+
public static HashMap<String,String> parseFeatures(String featureString) {
116+
HashMap<String, String> features = new HashMap<String, String>();
117+
if (! featureString.equals("_")) {
118+
String[] featValPairs = featureString.split("\\|");
119+
for (String p : featValPairs) {
120+
String[] featValPair = p.split("=");
121+
features.put(featValPair[0], featValPair[1]);
122+
}
123+
}
124+
return features;
125+
}
126+
127+
/**
128+
* Converts a feature HashMap to a feature string to be used
129+
* in a CoNLL-U file.
130+
*
131+
* @return The feature string.
132+
*/
133+
134+
public static String toFeatureString(HashMap<String,String> features) {
135+
StringBuffer sb = new StringBuffer();
136+
boolean first = true;
137+
List<String> sortedKeys = new ArrayList<String>(features.keySet());
138+
Collections.sort(sortedKeys, new FeatureNameComparator());
139+
for (String key : sortedKeys) {
140+
if ( ! first) {
141+
sb.append("|");
142+
} else {
143+
first = false;
144+
}
145+
146+
sb.append(key)
147+
.append("=")
148+
.append(features.get(key));
149+
150+
}
151+
152+
/* Empty feature list. */
153+
if (first) {
154+
sb.append("_");
155+
}
156+
157+
return sb.toString();
158+
}
159+
160+
public static class FeatureNameComparator implements Comparator<String> {
161+
162+
@Override
163+
public int compare(String featureName1, String featureName2) {
164+
return featureName1.toLowerCase().compareTo(featureName2.toLowerCase());
165+
}
166+
}
167+
}

0 commit comments

Comments
 (0)