Implement Unicode instructions

Summary: This provides implementations of our Unicode-savvy regexp instructions, which decode surrogate pairs. Reviewed By: avp Differential Revision: D17413812 fbshipit-source-id: dc983858b8ad175fff837f12eb6a082f1686dc15
tonyle9 · Oct 3, 2019 · 1cbe62e · 1cbe62e
1 parent 892bf2c
commit 1cbe62e
Show file tree

Hide file tree

Showing 3 changed files with 135 additions and 9 deletions.
diff --git a/include/hermes/Regex/RegexTraits.h b/include/hermes/Regex/RegexTraits.h
@@ -96,6 +96,26 @@ struct UTF16RegexTraits {
   bool rangesContain(llvm::ArrayRef<BracketRange32> ranges, CodePoint c) const {
     return anyRangeContainsChar(ranges, c);
   }
+
+  /// Decode a UTF16 character from a string \p s which ends at \p end.
+  /// Place the character in \p cp and advance \p s by the number of code units
+  /// decoded. An unpaired surrogate is decoded as itself (one code unit).
+  /// \return true if a character was decoded, false if not (which can only
+  /// occur if the string is empty).
+  static bool
+  decodeUTF16(const CodeUnit *&s, const CodeUnit *end, CodePoint *cp) {
+    if (s == end) {
+      return false;
+    }
+    if (s + 1 < end && isHighSurrogate(s[0]) && isLowSurrogate(s[1])) {
+      *cp = decodeSurrogatePair(s[0], s[1]);
+      s += 2;
+    } else {
+      *cp = s[0];
+      s += 1;
+    }
+    return true;
+  }
 };
 
 /// Implementation of regex::Traits for 7-bit ASCII.
@@ -138,6 +158,18 @@ struct ASCIIRegexTraits {
   bool rangesContain(llvm::ArrayRef<BracketRange32> ranges, char16_t c) const {
     return anyRangeContainsChar(ranges, c);
   }
+
+  /// Decode a character from a string \p s which ends at \p end, placing it
+  /// in \p cp and advancing \p s by one code unit.
+  /// \return true if a character was decoded, false if the string is empty.
+  static bool
+  decodeUTF16(const CodeUnit *&s, const CodeUnit *end, CodePoint *cp) {
+    // ASCII does not support surrogates so the implementation is trivial.
+    if (s == end)
+      return false;
+    *cp = *s++;
+    return true;
+  }
 };
 
 } // end namespace regex

diff --git a/lib/Regex/Executor.cpp b/lib/Regex/Executor.cpp
@@ -6,7 +6,6 @@
  */
 #include "hermes/Regex/Executor.h"
 #include "hermes/Regex/RegexTraits.h"
-#include "hermes/Support/ErrorHandling.h"
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/TrailingObjects.h"
@@ -307,6 +306,7 @@ struct Context {
   /// matches the instruction \p insn (with that opcode).
   template <Width1Opcode w1opcode>
   inline bool matchWidth1(const Insn *insn, CodeUnit c) const;
+
   /// \return true if all chars, stored in contiguous memory after \p insn,
   /// match the chars in state \p s in the same order, case insensitive. Note
   /// the count of chars is given in \p insn.
@@ -450,7 +450,7 @@ bool bracketMatchesChar(
     const Context<Traits> &ctx,
     const BracketInsn *insn,
     const BracketRange32 *ranges,
-    char16_t ch) {
+    typename Traits::CodePoint ch) {
   const auto &traits = ctx.traits_;
   // Note that if the bracket is negated /[^abc]/, we want to return true if we
   // do not match, false if we do. Implement this by xor with the negate flag.
@@ -777,10 +777,13 @@ auto Context<Traits>::match(
           s->ip_ += sizeof(MatchAnyButNewlineInsn);
           break;
 
-        case Opcode::U16MatchAnyButNewline: {
-          hermes_fatal("Unimplemented");
+        case Opcode::U16MatchAnyButNewline:
+          CodePoint cp;
+          if (!Traits::decodeUTF16(s->current_, last_, &cp)) {
+            BACKTRACK();
+          }
+          s->ip_ += sizeof(U16MatchAnyButNewlineInsn);
           break;
-        }
 
         case Opcode::MatchChar8: {
           if (s->current_ == last_ ||
@@ -801,7 +804,13 @@ auto Context<Traits>::match(
         }
 
         case Opcode::U16MatchChar32: {
-          hermes_fatal("Unimplemented");
+          const auto *insn = llvm::cast<U16MatchChar32Insn>(base);
+          CodePoint cp;
+          if (!Traits::decodeUTF16(s->current_, last_, &cp) ||
+              cp != (CodePoint)insn->c) {
+            BACKTRACK();
+          }
+          s->ip_ += sizeof(U16MatchChar32Insn);
           break;
         }
 
@@ -824,7 +833,14 @@ auto Context<Traits>::match(
         }
 
         case Opcode::U16MatchCharICase32: {
-          hermes_fatal("Unimplemented");
+          const auto *insn = llvm::cast<U16MatchCharICase32Insn>(base);
+          assert(insn->c >= 0x010000 && "Character should be astral");
+          CodePoint cp;
+          if (!Traits::decodeUTF16(s->current_, last_, &cp) ||
+              traits_.canonicalize(cp) != (CodePoint)insn->c) {
+            BACKTRACK();
+          }
+          s->ip_ += sizeof(U16MatchCharICase32Insn);
           break;
         }
 
@@ -891,8 +907,20 @@ auto Context<Traits>::match(
         }
 
         case Opcode::U16Bracket: {
-          llvm_unreachable("Unimplemented");
-          abort();
+          bool matched = false;
+          CodePoint cp;
+          const U16BracketInsn *insn = llvm::cast<U16BracketInsn>(base);
+          if (Traits::decodeUTF16(s->current_, last_, &cp)) {
+            // U16BracketInsn is followed by a list of BracketRange32s.
+            const BracketRange32 *ranges =
+                reinterpret_cast<const BracketRange32 *>(insn + 1);
+            matched = bracketMatchesChar<Traits>(*this, insn, ranges, cp);
+          }
+          if (!matched)
+            BACKTRACK();
+
+          s->ip_ += insn->totalWidth();
+          break;
         }
 
         case Opcode::WordBoundary: {

diff --git a/test/hermes/regexp_unicode.js b/test/hermes/regexp_unicode.js
@@ -0,0 +1,66 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the LICENSE
+// file in the root directory of this source tree.
+//
+// RUN: LC_ALL=en_US.UTF-8 %hermes -non-strict -O -target=HBC %s | %FileCheck --match-full-lines %s
+
+print('RegExp Unicode');
+// CHECK: RegExp Unicode
+
+try { new RegExp("\\u{FFFFFFFFF}", "u"); } catch (e) { print(e.message); }
+// CHECK-NEXT: Invalid RegExp pattern: Escaped value too large
+
+print(/abc/ui.exec("AbC"));
+// CHECK-NEXT: AbC
+
+print(/\u{48}/.exec("H"));
+// CHECK-NEXT: null
+
+print(/\u{48}/u.exec("H"));
+// CHECK-NEXT: H
+
+print(/\110/.exec("H"));
+// CHECK-NEXT: H
+
+try { new RegExp("\\110", "u"); } catch (e) { print(e.message); }
+// CHECK-NEXT: Invalid RegExp pattern: Invalid escape
+
+try { new RegExp("\\u{}", "u"); } catch (e) { print(e.message); }
+// CHECK-NEXT: Invalid RegExp pattern: Invalid escape
+
+try { new RegExp("\\u{ABC", "u"); } catch (e) { print(e.message); }
+// CHECK-NEXT: Invalid RegExp pattern: Invalid escape
+
+try { new RegExp("\\u{}", "u"); } catch (e) { print(e.message); }
+// CHECK-NEXT: Invalid RegExp pattern: Invalid escape
+
+try { new RegExp("\\u}", "u"); } catch (e) { print(e.message); }
+// CHECK-NEXT: Invalid RegExp pattern: Invalid escape
+
+try { new RegExp("\\u", "u"); } catch (e) { print(e.message); }
+// CHECK-NEXT: Invalid RegExp pattern: Invalid escape
+
+try { new RegExp("\\uZZZZ", "u"); } catch (e) { print(e.message); }
+// CHECK-NEXT: Invalid RegExp pattern: Invalid escape
+
+try { new RegExp("\\u{FFFFFFFFF}", "u"); } catch (e) { print(e.message); }
+// CHECK-NEXT: Invalid RegExp pattern: Escaped value too large
+
+print(/\u{1F600}/.exec("\u{1F600}"));
+// CHECK-NEXT: null
+
+// Here the match has length 2, because this emoji must be encoded via
+// a surrogate pair.
+print(/\u{1F600}/u.exec("\u{1F600}")[0].length);
+// CHECK-NEXT: 2
+
+// Here we expect to match because the surrogate pair looks like two characters to a non-unicode regexp.
+print(!! /^..$/.exec("\u{1F600}"));
+// CHECK-NEXT: true
+
+// Here we expect to not match because the surrogate pair should look like one character to a unicode regexp.
+print(!! /^..$/u.exec("\u{1F600}"));
+// CHECK-NEXT: false
+print(!! /^.$/u.exec("\u{1F600}"));
+// CHECK-NEXT: true