Skip to content

Commit

Permalink
Implement Unicode instructions
Browse files Browse the repository at this point in the history
Summary:
This provides implementations of our Unicode-savvy regexp instructions,
which decode surrogate pairs.

Reviewed By: avp

Differential Revision: D17413812

fbshipit-source-id: dc983858b8ad175fff837f12eb6a082f1686dc15
  • Loading branch information
Peter Ammon authored and facebook-github-bot committed Oct 3, 2019
1 parent 892bf2c commit 1cbe62e
Show file tree
Hide file tree
Showing 3 changed files with 135 additions and 9 deletions.
32 changes: 32 additions & 0 deletions include/hermes/Regex/RegexTraits.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,26 @@ struct UTF16RegexTraits {
bool rangesContain(llvm::ArrayRef<BracketRange32> ranges, CodePoint c) const {
return anyRangeContainsChar(ranges, c);
}

/// Decode a UTF16 character from a string \p s which ends at \p end.
/// Place the character in \p cp and advance \p s by the number of code units
/// decoded. If the character is an unpaired surrogate, return that surrogate.
/// \return true if a character was decoded, false if not (which can only
/// occur if the string is empty).
static bool
decodeUTF16(const CodeUnit *&s, const CodeUnit *end, CodePoint *cp) {
if (s == end) {
return false;
}
if (s + 1 < end && isHighSurrogate(s[0]) && isLowSurrogate(s[1])) {
*cp = decodeSurrogatePair(s[0], s[1]);
s += 2;
} else {
*cp = s[0];
s += 1;
}
return true;
}
};

/// Implementation of regex::Traits for 7-bit ASCII.
Expand Down Expand Up @@ -138,6 +158,18 @@ struct ASCIIRegexTraits {
bool rangesContain(llvm::ArrayRef<BracketRange32> ranges, char16_t c) const {
return anyRangeContainsChar(ranges, c);
}

/// Read one character from the range [\p s, \p end).
/// Each code unit is treated as a complete character, so exactly one code
/// unit is consumed on success.
/// \param s cursor into the string; advanced by one on success.
/// \param end one past the last code unit of the string.
/// \param cp receives the character when decoding succeeds.
/// \return false if the range is empty (nothing is consumed and \p cp is not
/// written), true otherwise.
static bool
decodeUTF16(const CodeUnit *&s, const CodeUnit *end, CodePoint *cp) {
  // ASCII has no surrogates, so there is never a pair to combine.
  const bool hasInput = s != end;
  if (hasInput) {
    *cp = s[0];
    s += 1;
  }
  return hasInput;
}
};

} // end namespace regex
Expand Down
46 changes: 37 additions & 9 deletions lib/Regex/Executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
*/
#include "hermes/Regex/Executor.h"
#include "hermes/Regex/RegexTraits.h"
#include "hermes/Support/ErrorHandling.h"

#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/TrailingObjects.h"
Expand Down Expand Up @@ -307,6 +306,7 @@ struct Context {
/// matches the instruction \p insn (with that opcode).
template <Width1Opcode w1opcode>
inline bool matchWidth1(const Insn *insn, CodeUnit c) const;

/// \return true if all chars, stored in contiguous memory after \p insn,
/// match the chars in state \p s in the same order, case insensitive. Note
/// the count of chars is given in \p insn.
Expand Down Expand Up @@ -450,7 +450,7 @@ bool bracketMatchesChar(
const Context<Traits> &ctx,
const BracketInsn *insn,
const BracketRange32 *ranges,
char16_t ch) {
typename Traits::CodePoint ch) {
const auto &traits = ctx.traits_;
// Note that if the bracket is negated /[^abc]/, we want to return true if we
// do not match, false if we do. Implement this by xor with the negate flag.
Expand Down Expand Up @@ -777,10 +777,13 @@ auto Context<Traits>::match(
s->ip_ += sizeof(MatchAnyButNewlineInsn);
break;

case Opcode::U16MatchAnyButNewline: {
hermes_fatal("Unimplemented");
case Opcode::U16MatchAnyButNewline:
CodePoint cp;
if (!Traits::decodeUTF16(s->current_, last_, &cp)) {
BACKTRACK();
}
s->ip_ += sizeof(U16MatchAnyButNewlineInsn);
break;
}

case Opcode::MatchChar8: {
if (s->current_ == last_ ||
Expand All @@ -801,7 +804,13 @@ auto Context<Traits>::match(
}

case Opcode::U16MatchChar32: {
hermes_fatal("Unimplemented");
const auto *insn = llvm::cast<U16MatchChar32Insn>(base);
CodePoint cp;
if (!Traits::decodeUTF16(s->current_, last_, &cp) ||
cp != (CodePoint)insn->c) {
BACKTRACK();
}
s->ip_ += sizeof(U16MatchChar32Insn);
break;
}

Expand All @@ -824,7 +833,14 @@ auto Context<Traits>::match(
}

case Opcode::U16MatchCharICase32: {
hermes_fatal("Unimplemented");
const auto *insn = llvm::cast<U16MatchCharICase32Insn>(base);
assert(insn->c >= 0x010000 && "Character should be astral");
CodePoint cp;
if (!Traits::decodeUTF16(s->current_, last_, &cp) ||
traits_.canonicalize(cp) != (CodePoint)insn->c) {
BACKTRACK();
}
s->ip_ += sizeof(U16MatchCharICase32Insn);
break;
}

Expand Down Expand Up @@ -891,8 +907,20 @@ auto Context<Traits>::match(
}

case Opcode::U16Bracket: {
llvm_unreachable("Unimplemented");
abort();
bool matched = false;
CodePoint cp;
const U16BracketInsn *insn = llvm::cast<U16BracketInsn>(base);
if (Traits::decodeUTF16(s->current_, last_, &cp)) {
// U16BracketInsn is followed by a list of BracketRange32s.
const BracketRange32 *ranges =
reinterpret_cast<const BracketRange32 *>(insn + 1);
matched = bracketMatchesChar<Traits>(*this, insn, ranges, cp);
}
if (!matched)
BACKTRACK();

s->ip_ += insn->totalWidth();
break;
}

case Opcode::WordBoundary: {
Expand Down
66 changes: 66 additions & 0 deletions test/hermes/regexp_unicode.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the LICENSE
// file in the root directory of this source tree.
//
// RUN: LC_ALL=en_US.UTF-8 %hermes -non-strict -O -target=HBC %s | %FileCheck --match-full-lines %s

// Banner line; the expected-output directives below are matched in order
// starting from here.
print('RegExp Unicode');
// CHECK: RegExp Unicode

// A braced escape whose value is too large is rejected when the u flag is set.
try { new RegExp("\\u{FFFFFFFFF}", "u"); } catch (e) { print(e.message); }
// CHECK-NEXT: Invalid RegExp pattern: Escaped value too large

// The u flag can be combined with case-insensitive matching.
print(/abc/ui.exec("AbC"));
// CHECK-NEXT: AbC

// Without the u flag, \u{48} is not read as a code point escape, so "H" does
// not match.
print(/\u{48}/.exec("H"));
// CHECK-NEXT: null

// With the u flag, \u{48} denotes U+0048 ("H").
print(/\u{48}/u.exec("H"));
// CHECK-NEXT: H

// Legacy octal escape: \110 is "H" in a non-unicode regexp...
print(/\110/.exec("H"));
// CHECK-NEXT: H

// ...but the same escape is an error when the u flag is set.
try { new RegExp("\\110", "u"); } catch (e) { print(e.message); }
// CHECK-NEXT: Invalid RegExp pattern: Invalid escape

// Malformed \u escapes are errors when the u flag is set: empty braces,
// a missing closing brace, a stray closing brace, no digits at all, and
// non-hex digits.
try { new RegExp("\\u{}", "u"); } catch (e) { print(e.message); }
// CHECK-NEXT: Invalid RegExp pattern: Invalid escape

try { new RegExp("\\u{ABC", "u"); } catch (e) { print(e.message); }
// CHECK-NEXT: Invalid RegExp pattern: Invalid escape

// (Repeats the empty-braces case above.)
try { new RegExp("\\u{}", "u"); } catch (e) { print(e.message); }
// CHECK-NEXT: Invalid RegExp pattern: Invalid escape

try { new RegExp("\\u}", "u"); } catch (e) { print(e.message); }
// CHECK-NEXT: Invalid RegExp pattern: Invalid escape

try { new RegExp("\\u", "u"); } catch (e) { print(e.message); }
// CHECK-NEXT: Invalid RegExp pattern: Invalid escape

try { new RegExp("\\uZZZZ", "u"); } catch (e) { print(e.message); }
// CHECK-NEXT: Invalid RegExp pattern: Invalid escape

// (Repeats the too-large escape case from the top of the file.)
try { new RegExp("\\u{FFFFFFFFF}", "u"); } catch (e) { print(e.message); }
// CHECK-NEXT: Invalid RegExp pattern: Escaped value too large

// Without the u flag the pattern does not denote U+1F600, so the emoji is
// not matched.
print(/\u{1F600}/.exec("\u{1F600}"));
// CHECK-NEXT: null

// Here the match has length 2, because this emoji must be encoded via
// a surrogate pair.
print(/\u{1F600}/u.exec("\u{1F600}")[0].length);
// CHECK-NEXT: 2

// Here we expect to match because the surrogate pair looks like two
// characters to a non-unicode regexp.
print(!! /^..$/.exec("\u{1F600}"));
// CHECK-NEXT: true

// Here we expect to not match because the surrogate pair should look like
// one character to a unicode regexp.
print(!! /^..$/u.exec("\u{1F600}"));
// CHECK-NEXT: false
// Conversely, a single "." consumes the whole surrogate pair in a unicode
// regexp.
print(!! /^.$/u.exec("\u{1F600}"));
// CHECK-NEXT: true

0 comments on commit 1cbe62e

Please sign in to comment.