Skip to content

Commit

Permalink
Merge remote-tracking branch 'refs/remotes/stanfordnlp/master'
Browse files Browse the repository at this point in the history
  • Loading branch information
cpig committed May 17, 2017
2 parents c92068e + 748e9f3 commit beed193
Show file tree
Hide file tree
Showing 42 changed files with 102,146 additions and 93,645 deletions.
2 changes: 1 addition & 1 deletion doc/corenlp/pom-full.xml
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
<version>2.6.1</version>
<version>3.2.0</version>
</dependency>

<dependency>
Expand Down
Binary file modified lib/protobuf.jar
Binary file not shown.
Binary file added libsrc/protobuf-java-3.2.0-sources.jar
Binary file not shown.
Binary file removed libsrc/protobuf-src.jar
Binary file not shown.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
<version>2.6.1</version>
<version>3.2.0</version>
</dependency>

<dependency>
Expand Down
34 changes: 17 additions & 17 deletions src/edu/stanford/nlp/ie/QuantifiableEntityNormalizer.java
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package edu.stanford.nlp.ie;
package edu.stanford.nlp.ie;
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.ie.pascal.ISODateInstance;
Expand Down Expand Up @@ -907,7 +907,7 @@ public static String normalizedOrdinalString(String s, Number numberFromSUTime)
return normalizedOrdinalStringQuiet(s, numberFromSUTime);
}

public static final Pattern numberPattern = Pattern.compile("([0-9.]+)");
private static final Pattern numberPattern = Pattern.compile("([0-9.]+)");

public static String normalizedOrdinalStringQuiet(String s, Number numberFromSUTime) {
// clean up string
Expand Down Expand Up @@ -1132,7 +1132,7 @@ public static List<List<CoreLabel>> normalizeClassifierOutput(List<List<CoreLabe
* Takes the strings of the three previous and next words to a quantity and
* detects a
* quantity modifier like "less than", "more than", etc.
* Any of these words may be <code>null</code> or an empty String.
* Any of these words may be {@code null} or an empty String.
*/
private static <E extends CoreMap> String detectQuantityModifier(List<E> list, int beforeIndex, int afterIndex) {
String prev = (beforeIndex >= 0) ? list.get(beforeIndex).get(CoreAnnotations.TextAnnotation.class).toLowerCase(): "";
Expand Down Expand Up @@ -1175,26 +1175,26 @@ private static <E extends CoreMap> String detectQuantityModifier(List<E> list, i
}


private static String earlyOneWord = "early";
private static String earlyTwoWords = "(?:dawn|eve|beginning) of";
private static String earlyThreeWords = "early in the";
private static String lateOneWord = "late";
private static String lateTwoWords = "late at|end of";
private static String lateThreeWords = "end of the";
private static String middleTwoWords = "(?:middle|midst) of";
private static String middleThreeWords = "(?:middle|midst) of the";
private static final String earlyOneWord = "early";
private static final String earlyTwoWords = "(?:dawn|eve|beginning) of";
private static final String earlyThreeWords = "early in the";
private static final String lateOneWord = "late";
private static final String lateTwoWords = "late at|end of";
private static final String lateThreeWords = "end of the";
private static final String middleTwoWords = "(?:middle|midst) of";
private static final String middleThreeWords = "(?:middle|midst) of the";

private static String amOneWord = "[Aa]\\.?[Mm]\\.?";
private static String pmOneWord = "[Pp]\\.?[Mm]\\.?";
private static String amThreeWords = "in the morning";
private static String pmTwoWords = "at night";
private static String pmThreeWords = "in the (?:afternoon|evening)";
private static final String amOneWord = "[Aa]\\.?[Mm]\\.?";
private static final String pmOneWord = "[Pp]\\.?[Mm]\\.?";
private static final String amThreeWords = "in the morning";
private static final String pmTwoWords = "at night";
private static final String pmThreeWords = "in the (?:afternoon|evening)";


/**
* Takes the words preceding and following a quantity and detects a
* time-of-day modifier like "early", "late", "in the morning", etc.
* Any of these words may be <code>null</code> or an empty String.
* Any of these words may be {@code null} or an empty String.
*/
private static <E extends CoreMap> String detectTimeOfDayModifier(List<E> list, int beforeIndex, int afterIndex) {
String prev = (beforeIndex >= 0) ? list.get(beforeIndex).get(CoreAnnotations.TextAnnotation.class).toLowerCase() : "";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,18 +156,18 @@ import edu.stanford.nlp.util.logging.Redwood;
} else if (value.equals("allKeep")) {
untokenizable = UntokenizableOptions.ALL_KEEP;
} else {
throw new IllegalArgumentException("FrenchLexer: Invalid option value in constructor: " + key + ": " + value);
throw new IllegalArgumentException("SpanishLexer: Invalid option value in constructor: " + key + ": " + value);
}
} else if ("strictTreebank3".equals(key)) {
strictTreebank3 = val;
} else {
System.err.printf("%s: Invalid options key in constructor: %s%n", this.getClass().getName(), key);
throw new IllegalArgumentException(String.format("%s: Invalid options key in constructor: %s%n", this.getClass().getName(), key));
}
}
// this.seenUntokenizableCharacter = false; // unnecessary, it's default initialized
if (invertible) {
if ( ! (tf instanceof CoreLabelTokenFactory)) {
throw new IllegalArgumentException("FrenchLexer: the invertible option requires a CoreLabelTokenFactory");
throw new IllegalArgumentException("SpanishLexer: the invertible option requires a CoreLabelTokenFactory");
}
prevWord = (CoreLabel) tf.makeToken("", 0, 0);
prevWordAfter = new StringBuilder();
Expand Down Expand Up @@ -274,8 +274,8 @@ import edu.stanford.nlp.util.logging.Redwood;
return result.length() == 0 ? "-" : result;
}

private static final Pattern asciiSingleQuote = Pattern.compile("&apos;|[\u0091\u2018\u0092\u2019\u201A\u201B\u2039\u203A']");
private static final Pattern asciiDoubleQuote = Pattern.compile("&quot;|[\u0093\u201C\u0094\u201D\u201E\u00AB\u00BB\"]");
private static final Pattern asciiSingleQuote = Pattern.compile("&apos;|[\u0082\u0091\u2018\u0092\u2019\u201A\u201B\u2039\u203A']");
private static final Pattern asciiDoubleQuote = Pattern.compile("&quot;|[\u0084\u0093\u201C\u0094\u201D\u201E\u00AB\u00BB\"]");

private static String Shlomi2AsciiQuotes(String in) {
return asciiQuotes(in);
Expand Down Expand Up @@ -335,7 +335,7 @@ import edu.stanford.nlp.util.logging.Redwood;
private static final Pattern AMP_PATTERN = Pattern.compile("(?i:&amp;)");

private static String normalizeAmp(final String in) {
return AMP_PATTERN.matcher(in).replaceAll("&");
return AMP_PATTERN.matcher(in).replaceAll("&");
}

private static String convertToEl(String l) {
Expand Down Expand Up @@ -553,14 +553,14 @@ ASTS = \*+|(\\\*){1,3}
HASHES = #+
FNMARKS = {ATS}|{HASHES}|{UNDS}
INSENTP = [,;:\u3001]
QUOTES = {APOSETCETERA}|''|[`\u2018\u2019\u201A\u201B\u201C\u201D\u0091\u0092\u0093\u0094\u201E\u201F\u2039\u203A\u00AB\u00BB]{1,2}
QUOTES = {APOSETCETERA}|''|[`\u2018\u2019\u201A\u201B\u201C\u201D\u0082\u0084\u0091-\u0094\u201E\u201F\u2039\u203A\u00AB\u00BB]{1,2}

DBLQUOT = \"|&quot;

/* U+2200-U+2BFF covers many of the mathematical and other symbol ranges */
MISCSYMBOL = [+%&~\^|\\¦\u00A7¨\u00A9\u00AC\u00AE¯\u00B0-\u00B3\u00B4-\u00BA\u00D7\u00F7\u0387\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0600-\u0603\u0606-\u060A\u060C\u0614\u061B\u061E\u066A\u066D\u0703-\u070D\u07F6\u07F7\u07F8\u0964\u0965\u0E4F\u1FBD\u2016\u2017\u2020-\u2023\u2030-\u2038\u203B\u203E-\u2042\u2044\u207A-\u207F\u208A-\u208E\u2100-\u214F\u2190-\u21FF\u2200-\u2BFF\u3012\u30FB\uFF01-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF65\uFF65]
/* \uFF65 is Halfwidth katakana middle dot; \u30FB is Katakana middle dot */
/* Math and other symbols that stand alone: °²× ∀ */
// Consider this list of bullet chars: 2219, 00b7, 2022, 2024


%%
Expand Down Expand Up @@ -730,18 +730,20 @@ cannot { yypushback(3) ; return getNext(); }

{FAKEDUCKFEET} |
{MISCSYMBOL} { return getNext(); }
\u0095 { return getNext("\u2022", yytext()); } /* cp1252 bullet mapped to unicode */
\u0099 { return getNext("\u2122", yytext()); } /* cp1252 TM sign mapped to unicode */

\0|{SPACES}|[\u200B\u200E-\u200F\uFEFF] { if (invertible) {
prevWordAfter.append(yytext());
}
}
{NEWLINE} { if (tokenizeNLs) {
{NEWLINE} { if (tokenizeNLs) {
return getNext(NEWLINE_TOKEN, yytext()); // js: for tokenizing carriage returns
} else if (invertible) {
prevWordAfter.append(yytext());
}
}
&nbsp; { if (invertible) {
&nbsp; { if (invertible) {
prevWordAfter.append(yytext());
}
}
Expand Down
Loading

0 comments on commit beed193

Please sign in to comment.