[GR-40008] Treat RegExp identifiers as if they are in Unicode mode.

PullRequest: graal/12437
jessealama · Aug 29, 2022 · 96387f0 · 96387f0
2 parents 0cf94ee + b3bdb52
commit 96387f0
Showing 1 changed file with 10 additions and 7 deletions.
diff --git a/...x/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexLexer.java b/...x/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexLexer.java
@@ -413,7 +413,7 @@ private Token parseGroupBegin() throws RegexSyntaxException {
 
     private int parseCodePointInGroupName() throws RegexSyntaxException {
         if (consumingLookahead("\\u")) {
-            final int unicodeEscape = parseUnicodeEscapeChar();
+            final int unicodeEscape = parseUnicodeEscapeChar(true);
             if (unicodeEscape < 0) {
                 throw syntaxError(ErrorMessages.INVALID_UNICODE_ESCAPE);
             } else {
@@ -427,7 +427,7 @@ private int parseCodePointInGroupName() throws RegexSyntaxException {
             return -1;
         }
         final char c = consumeChar();
-        return flags.isUnicode() && Character.isHighSurrogate(c) ? finishSurrogatePair(c) : c;
+        return Character.isHighSurrogate(c) ? finishSurrogatePair(c) : c;
     }
 
     /**
@@ -658,19 +658,22 @@ private CodePointSet parseUnicodeCharacterProperty(boolean invert) throws RegexS
     /**
      * Parse a {@code RegExpUnicodeEscapeSequence}, assuming that the prefix '&#92;u' has already
      * been read.
+     * 
+     * @param unicodeMode whether we are in Unicode mode, which allows '&#92;u{...}' escapes and
+     *            treats surrogate pairs as single code points
      *
      * @return the code point of the escaped character, or -1 if the escape was malformed
      */
-    private int parseUnicodeEscapeChar() throws RegexSyntaxException {
-        if (flags.isUnicode() && consumingLookahead("{")) {
+    private int parseUnicodeEscapeChar(boolean unicodeMode) throws RegexSyntaxException {
+        if (unicodeMode && consumingLookahead("{")) {
             final int value = parseHex(1, Integer.MAX_VALUE, 0x10ffff, ErrorMessages.INVALID_UNICODE_ESCAPE);
             if (!consumingLookahead("}")) {
                 throw syntaxError(ErrorMessages.INVALID_UNICODE_ESCAPE);
             }
             return value;
         } else {
             final int value = parseHex(4, 4, 0xffff, ErrorMessages.INVALID_UNICODE_ESCAPE);
-            if (flags.isUnicode() && Character.isHighSurrogate((char) value)) {
+            if (unicodeMode && Character.isHighSurrogate((char) value)) {
                 final int resetIndex = index;
                 if (consumingLookahead("\\u") && !lookahead("{")) {
                     final char lead = (char) value;
@@ -728,7 +731,7 @@ private int parseEscapeChar(char c, boolean inCharClass) throws RegexSyntaxExcep
                 advance();
                 return Character.toUpperCase(controlLetter) - ('A' - 1);
             case 'u':
-                final int unicodeEscape = parseUnicodeEscapeChar();
+                final int unicodeEscape = parseUnicodeEscapeChar(flags.isUnicode());
                 return unicodeEscape < 0 ? c : unicodeEscape;
             case 'x':
                 final int value = parseHex(2, 2, 0xff, ErrorMessages.INVALID_ESCAPE);
@@ -750,7 +753,7 @@ private int parseEscapeChar(char c, boolean inCharClass) throws RegexSyntaxExcep
     }
 
     private int finishSurrogatePair(char c) {
-        assert flags.isUnicode() && Character.isHighSurrogate(c);
+        assert Character.isHighSurrogate(c);
         if (!atEnd() && Character.isLowSurrogate(curChar())) {
             final char lead = c;
             final char trail = consumeChar();