Merge remote-tracking branch 'refs/remotes/stanfordnlp/master'

cpig · May 17, 2017 · beed193
2 parents c92068e + 748e9f3
commit beed193
Show file tree

Hide file tree

Showing 42 changed files with 102,146 additions and 93,645 deletions.
diff --git a/doc/corenlp/pom-full.xml b/doc/corenlp/pom-full.xml
@@ -117,7 +117,7 @@
     <dependency>
       <groupId>com.google.protobuf</groupId>
       <artifactId>protobuf-java</artifactId>
-      <version>2.6.1</version>
+      <version>3.2.0</version>
     </dependency>
 
     <dependency>

diff --git a/lib/protobuf.jar b/lib/protobuf.jar
diff --git a/libsrc/protobuf-java-3.2.0-sources.jar b/libsrc/protobuf-java-3.2.0-sources.jar
diff --git a/libsrc/protobuf-src.jar b/libsrc/protobuf-src.jar
diff --git a/pom.xml b/pom.xml
@@ -117,7 +117,7 @@
     <dependency>
       <groupId>com.google.protobuf</groupId>
       <artifactId>protobuf-java</artifactId>
-      <version>2.6.1</version>
+      <version>3.2.0</version>
     </dependency>
 
     <dependency>

diff --git a/src/edu/stanford/nlp/ie/QuantifiableEntityNormalizer.java b/src/edu/stanford/nlp/ie/QuantifiableEntityNormalizer.java
@@ -1,4 +1,4 @@
-package edu.stanford.nlp.ie; 
+package edu.stanford.nlp.ie;
 import edu.stanford.nlp.util.logging.Redwood;
 
 import edu.stanford.nlp.ie.pascal.ISODateInstance;
@@ -907,7 +907,7 @@ public static String normalizedOrdinalString(String s, Number numberFromSUTime)
     return normalizedOrdinalStringQuiet(s, numberFromSUTime);
   }
 
-  public static final Pattern numberPattern = Pattern.compile("([0-9.]+)");
+  private static final Pattern numberPattern = Pattern.compile("([0-9.]+)");
 
   public static String normalizedOrdinalStringQuiet(String s, Number numberFromSUTime) {
     // clean up string
@@ -1132,7 +1132,7 @@ public static List<List<CoreLabel>> normalizeClassifierOutput(List<List<CoreLabe
    * Takes the strings of the three previous and next words to a quantity and
    * detects a
    * quantity modifier like "less than", "more than", etc.
-   * Any of these words may be <code>null</code> or an empty String.
+   * Any of these words may be {@code null} or an empty String.
    */
   private static <E extends CoreMap> String detectQuantityModifier(List<E> list, int beforeIndex, int afterIndex) {
     String prev = (beforeIndex >= 0) ? list.get(beforeIndex).get(CoreAnnotations.TextAnnotation.class).toLowerCase(): "";
@@ -1175,26 +1175,26 @@ private static <E extends CoreMap> String detectQuantityModifier(List<E> list, i
   }
 
 
-  private static String earlyOneWord = "early";
-  private static String earlyTwoWords = "(?:dawn|eve|beginning) of";
-  private static String earlyThreeWords = "early in the";
-  private static String lateOneWord = "late";
-  private static String lateTwoWords = "late at|end of";
-  private static String lateThreeWords = "end of the";
-  private static String middleTwoWords = "(?:middle|midst) of";
-  private static String middleThreeWords = "(?:middle|midst) of the";
+  private static final String earlyOneWord = "early";
+  private static final String earlyTwoWords = "(?:dawn|eve|beginning) of";
+  private static final String earlyThreeWords = "early in the";
+  private static final String lateOneWord = "late";
+  private static final String lateTwoWords = "late at|end of";
+  private static final String lateThreeWords = "end of the";
+  private static final String middleTwoWords = "(?:middle|midst) of";
+  private static final String middleThreeWords = "(?:middle|midst) of the";
 
-  private static String amOneWord = "[Aa]\\.?[Mm]\\.?";
-  private static String pmOneWord = "[Pp]\\.?[Mm]\\.?";
-  private static String amThreeWords = "in the morning";
-  private static String pmTwoWords = "at night";
-  private static String pmThreeWords = "in the (?:afternoon|evening)";
+  private static final String amOneWord = "[Aa]\\.?[Mm]\\.?";
+  private static final String pmOneWord = "[Pp]\\.?[Mm]\\.?";
+  private static final String amThreeWords = "in the morning";
+  private static final String pmTwoWords = "at night";
+  private static final String pmThreeWords = "in the (?:afternoon|evening)";
 
 
   /**
    * Takes the strings of the three previous words to a quantity and detects a
    * quantity modifier like "less than", "more than", etc.
-   * Any of these words may be <code>null</code> or an empty String.
+   * Any of these words may be {@code null} or an empty String.
    */
   private static <E extends CoreMap> String detectTimeOfDayModifier(List<E> list, int beforeIndex, int afterIndex) {
     String prev = (beforeIndex >= 0) ? list.get(beforeIndex).get(CoreAnnotations.TextAnnotation.class).toLowerCase() : "";

diff --git a/src/edu/stanford/nlp/international/spanish/process/SpanishLexer.flex b/src/edu/stanford/nlp/international/spanish/process/SpanishLexer.flex
@@ -156,18 +156,18 @@ import edu.stanford.nlp.util.logging.Redwood;
 	} else if (value.equals("allKeep")) {
 	  untokenizable = UntokenizableOptions.ALL_KEEP;
 	} else {
-        throw new IllegalArgumentException("FrenchLexer: Invalid option value in constructor: " + key + ": " + value);
+        throw new IllegalArgumentException("SpanishLexer: Invalid option value in constructor: " + key + ": " + value);
 	}
       } else if ("strictTreebank3".equals(key)) {
         strictTreebank3 = val;
       } else {
-        System.err.printf("%s: Invalid options key in constructor: %s%n", this.getClass().getName(), key);
+        throw new IllegalArgumentException(String.format("%s: Invalid options key in constructor: %s%n", this.getClass().getName(), key));
       }
     }
     // this.seenUntokenizableCharacter = false; // unnecessary, it's default initialized
     if (invertible) {
       if ( ! (tf instanceof CoreLabelTokenFactory)) {
-        throw new IllegalArgumentException("FrenchLexer: the invertible option requires a CoreLabelTokenFactory");
+        throw new IllegalArgumentException("SpanishLexer: the invertible option requires a CoreLabelTokenFactory");
       }
       prevWord = (CoreLabel) tf.makeToken("", 0, 0);
       prevWordAfter = new StringBuilder();
@@ -274,8 +274,8 @@ import edu.stanford.nlp.util.logging.Redwood;
     return result.length() == 0 ? "-" : result;
   }
 
-  private static final Pattern asciiSingleQuote = Pattern.compile("&apos;|[\u0091\u2018\u0092\u2019\u201A\u201B\u2039\u203A']");
-  private static final Pattern asciiDoubleQuote = Pattern.compile("&quot;|[\u0093\u201C\u0094\u201D\u201E\u00AB\u00BB\"]");
+  private static final Pattern asciiSingleQuote = Pattern.compile("&apos;|[\u0082\u0091\u2018\u0092\u2019\u201A\u201B\u2039\u203A']");
+  private static final Pattern asciiDoubleQuote = Pattern.compile("&quot;|[\u0084\u0093\u201C\u0094\u201D\u201E\u00AB\u00BB\"]");
 
   private static String  Shlomi2AsciiQuotes(String in) {
     return asciiQuotes(in);
@@ -335,7 +335,7 @@ import edu.stanford.nlp.util.logging.Redwood;
   private static final Pattern AMP_PATTERN = Pattern.compile("(?i:&amp;)");
 
   private static String normalizeAmp(final String in) {
-      return AMP_PATTERN.matcher(in).replaceAll("&");
+    return AMP_PATTERN.matcher(in).replaceAll("&");
   }
 
   private static String convertToEl(String l) {
@@ -553,14 +553,14 @@ ASTS = \*+|(\\\*){1,3}
 HASHES = #+
 FNMARKS = {ATS}|{HASHES}|{UNDS}
 INSENTP = [,;:\u3001]
-QUOTES = {APOSETCETERA}|''|[`\u2018\u2019\u201A\u201B\u201C\u201D\u0091\u0092\u0093\u0094\u201E\u201F\u2039\u203A\u00AB\u00BB]{1,2}
+QUOTES = {APOSETCETERA}|''|[`\u2018\u2019\u201A\u201B\u201C\u201D\u0082\u0084\u0091-\u0094\u201E\u201F\u2039\u203A\u00AB\u00BB]{1,2}
+
 DBLQUOT = \"|&quot;
 
 /* U+2200-U+2BFF has a lot of the various mathematical, etc. symbol ranges */
 MISCSYMBOL = [+%&~\^|\\¦\u00A7¨\u00A9\u00AC\u00AE¯\u00B0-\u00B3\u00B4-\u00BA\u00D7\u00F7\u0387\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0600-\u0603\u0606-\u060A\u060C\u0614\u061B\u061E\u066A\u066D\u0703-\u070D\u07F6\u07F7\u07F8\u0964\u0965\u0E4F\u1FBD\u2016\u2017\u2020-\u2023\u2030-\u2038\u203B\u203E-\u2042\u2044\u207A-\u207F\u208A-\u208E\u2100-\u214F\u2190-\u21FF\u2200-\u2BFF\u3012\u30FB\uFF01-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF65\uFF65]
 /* \uFF65 is Halfwidth katakana middle dot; \u30FB is Katakana middle dot */
 /* Math and other symbols that stand alone: °²× ∀ */
-// Consider this list of bullet chars: 2219, 00b7, 2022, 2024
 
 
 %%
@@ -730,18 +730,20 @@ cannot			{ yypushback(3) ; return getNext(); }
 
 {FAKEDUCKFEET} |
 {MISCSYMBOL}	{ return getNext(); }
+\u0095          { return getNext("\u2022", yytext()); } /* cp1252 bullet mapped to unicode */
+\u0099          { return getNext("\u2122", yytext()); } /* cp1252 TM sign mapped to unicode */
 
 \0|{SPACES}|[\u200B\u200E-\u200F\uFEFF]	{ if (invertible) {
                      prevWordAfter.append(yytext());
                   }
                 }
-{NEWLINE}	{ if (tokenizeNLs) {
+{NEWLINE}	      { if (tokenizeNLs) {
                       return getNext(NEWLINE_TOKEN, yytext()); // js: for tokenizing carriage returns
                   } else if (invertible) {
                       prevWordAfter.append(yytext());
                   }
                 }
-&nbsp;		{ if (invertible) {
+&nbsp;		      { if (invertible) {
                      prevWordAfter.append(yytext());
                   }
                 }