Skip to content

Commit

Permalink
Optimize matching multiple literal characters
Browse files Browse the repository at this point in the history
Summary:
Hermes is a bytecode compiler for JavaScript. This diff will optimize the implementation of literal opcodes in its regex engine.

Literal characters in a regex like /abcdefg/ are compiled as a sequence of MatchChar8 opcodes. This requires a separate instruction dispatch for each character. This diff will add a multi-character variant: a single regex opcode which can match multiple characters.

Example:
> echo '/abcd/' | hermes -dump-bytecode

  Header: marked: 0 loops: 0 flags: 0 constraints: 4
  0000  MatchChar8: 'a'
  0002  MatchChar8: 'b'
  0004  MatchChar8: 'c'
  0006  MatchChar8: 'd'
  0008  Goal

Should Be:

Example:
> echo '/abcd/' | hermes -dump-bytecode

  Header: marked: 0 loops: 0 flags: 0 constraints: 4
  0000  MatchNChar8: 'abcd'
  0006  Goal

While this may not be a large 'win' in terms of instruction compression, it does make bytecode easier to read. The optimization only saves one byte per MatchChar, starting with the 3rd char in a group. Single chars should *not* become MatchNChar8, as doing so is less memory efficient and less readable.

Reviewed By: ridiculousfish

Differential Revision: D16119452

fbshipit-source-id: 8f386a582d6d6a921f6c3bbf66280c05e682a4e8
  • Loading branch information
artem-shu authored and facebook-github-bot committed Jul 18, 2019
1 parent 7e803f4 commit 8733459
Show file tree
Hide file tree
Showing 6 changed files with 301 additions and 41 deletions.
200 changes: 161 additions & 39 deletions include/hermes/Regex/Compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include "hermes/Regex/RegexBytecode.h"

#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/raw_ostream.h"

Expand Down Expand Up @@ -298,6 +299,16 @@ class Node {
return false;
}

/// Perform optimization on Node's contents.
/// The default implementation does nothing. Nodes that own nested node lists
/// override this to optimize those lists.
virtual void optimizeNodeContents() {}

/// If this Node can be coalesced into a single MatchCharNode,
/// then add the node's characters to \p output and \return true.
/// Otherwise \return false.
/// The base implementation never coalesces: it leaves \p output untouched
/// and always returns false.
virtual bool tryCoalesceCharacters(std::vector<char16_t> *output) const {
return false;
}

protected:
/// \return the match constraints for this node.
/// This should be overridden by subclasses to report the constraints for that
Expand All @@ -312,6 +323,9 @@ class Node {
virtual void emit(RegexBytecodeStream &bcs) const {}
};

/// Coalesce adjacent MatchCharNode in NodeList.
/// Each node in \p nodes is first asked to optimize its own contents; then
/// runs of three or more adjacent coalescable nodes are replaced in place by
/// a single multi-character MatchCharNode.
static void coalesceCaseSensitiveASCIINodeList(NodeList &nodes);

/// GoalNode is the terminal Node that represents successful execution.
class GoalNode final : public Node {
public:
Expand Down Expand Up @@ -384,6 +398,10 @@ class LoopNode final : public Node {
return result | Super::matchConstraints();
}

/// Coalesce adjacent literal characters inside the node list held in loopee_.
virtual void optimizeNodeContents() override {
coalesceCaseSensitiveASCIINodeList(loopee_);
}

private:
/// Override of emit() to compile our looped expression and add a jump
/// back to the loop.
Expand Down Expand Up @@ -481,6 +499,11 @@ class AlternationNode final : public Node {
return result | Super::matchConstraints();
}

/// Coalesce adjacent literal characters in each of the two node lists
/// (first_ and second_) independently.
virtual void optimizeNodeContents() override {
coalesceCaseSensitiveASCIINodeList(first_);
coalesceCaseSensitiveASCIINodeList(second_);
}

void emit(RegexBytecodeStream &bcs) const override {
// Instruction stream looks like:
// [Alternation][PrimaryBranch][Jump][SecondaryBranch][...]
Expand Down Expand Up @@ -616,74 +639,138 @@ class MatchAnyButNewlineNode final : public Node {
}
};

/// MatchChar matches a single character, specified as a parameter to the
/// MatchChar matches one or more characters, specified as a parameter to the
/// constructor.
class MatchCharNode final : public Node {
class MatchCharNodeBase : public Node {
using Super = Node;

// The character we wish to match against.
char16_t c_;

public:
MatchCharNode(char16_t c) : c_(c) {}
// The character literals we wish to match against.
llvm::SmallVector<char16_t, 10> literals_;

MatchCharNodeBase(char16_t c) : literals_{c} {}

MatchCharNodeBase(std::vector<char16_t> *chars) {
literals_.insert(literals_.begin(), chars->begin(), chars->end());
}

virtual MatchConstraintSet matchConstraints() const override {
MatchConstraintSet result = MatchConstraintNonEmpty;
// If our character is not ASCII, then we cannot match pure-ASCII strings.
if (!isASCII(c_))
if (!isPureASCII()) {
result |= MatchConstraintNonASCII;
}
return result | Super::matchConstraints();
}

void emit(RegexBytecodeStream &bcs) const override {
if (isASCII(c_)) {
bcs.emit<MatchChar8Insn>()->c = c_;
// if size is 1, then NChar instruction is less efficient than singular.
if (isPureASCII() && literals_.size() > 1) {
emitPureASCIIImpl(bcs);
} else {
bcs.emit<MatchChar16Insn>()->c = c_;
emitMixedASCIIImpl(bcs);
}
}

protected:
/// \return true when literals_ contains no character outside the ASCII range
/// (vacuously true for an empty list).
bool isPureASCII() const {
  return std::all_of(
      std::begin(literals_), std::end(literals_), [](char16_t ch) {
        return isASCII(ch);
      });
}

virtual bool matchesExactlyOneCharacter() const override {
return true;
return literals_.size() == 1;
}
};

/// MatchCharICase matches a single character, ignoring case.
template <class Traits>
class MatchCharICaseNode : public Node {
using Super = Node;
/// Implementation of emitPureASCII to avoid class templates.
virtual void emitPureASCIIImpl(RegexBytecodeStream &bcs) const = 0;
/// Emit every character of literals_ into \p bcs as one multi-character
/// instruction: a MatchNChar8InsnType whose charCount is the number of
/// literals, followed immediately by one 8-bit character per literal.
template <typename MatchNChar8InsnType>
void emitPureASCII(RegexBytecodeStream &bcs) const {
  // charCount is a single byte, so the literal count has to fit in it.
  assert(
      literals_.size() <= UINT8_MAX &&
      "Literal count cannot exceed uint8 max");
  // Write the instruction header first; the character payload follows it.
  bcs.emit<MatchNChar8InsnType>()->charCount = literals_.size();
  for (const char16_t literal : literals_) {
    bcs.emitChar8(literal);
  }
}

// Traits used for case conversion.
const Traits &traits_;
/// Implementation of emitMixedASCII to avoid class templates.
virtual void emitMixedASCIIImpl(RegexBytecodeStream &bcs) const = 0;
/// Emit one single-character instruction per literal into \p bcs, choosing
/// MatchChar8InsnType for ASCII characters and MatchChar16InsnType for all
/// others.
template <typename MatchChar8InsnType, typename MatchChar16InsnType>
void emitMixedASCII(RegexBytecodeStream &bcs) const {
  for (const char16_t literal : literals_) {
    if (!isASCII(literal)) {
      bcs.emit<MatchChar16InsnType>()->c = literal;
      continue;
    }
    bcs.emit<MatchChar8InsnType>()->c = literal;
  }
}
};

// The character we wish to match against.
char16_t c_;
class MatchCharNode final : public MatchCharNodeBase {
using Super = MatchCharNodeBase;

public:
MatchCharICaseNode(const Traits &traits, char16_t c)
: traits_(traits), c_(traits.caseFold(c)) {}
MatchCharNode(char16_t c) : Super(c) {}

virtual void emit(RegexBytecodeStream &bcs) const override {
if (isASCII(c_)) {
bcs.emit<MatchCharICase8Insn>()->c = c_;
} else {
bcs.emit<MatchCharICase16Insn>()->c = c_;
MatchCharNode(std::vector<char16_t> *chars) : Super(chars) {}

/// Append this node's literals to \p output when the combined run can still be
/// emitted as a single multi-character ASCII instruction.
/// \param output characters gathered so far for the run being coalesced; it
///   is extended only when this call succeeds.
/// \return true if the literals were appended; false if this node holds a
///   non-ASCII character or the combined run would exceed UINT8_MAX characters.
virtual bool tryCoalesceCharacters(
    std::vector<char16_t> *output) const override {
  // Only coalesce pure ASCII characters.
  if (!isPureASCII()) {
    return false;
  }
  // The coalesced instruction stores its length in a uint8_t charCount, so a
  // run of exactly UINT8_MAX characters is still representable. Reject only
  // runs that would grow beyond that.
  if (output->size() + literals_.size() > UINT8_MAX) {
    return false;
  }
  output->insert(output->end(), literals_.begin(), literals_.end());
  return true;
}

virtual MatchConstraintSet matchConstraints() const override {
MatchConstraintSet result = MatchConstraintNonEmpty;
// If our character is not ASCII, then we cannot match pure-ASCII strings.
// Note this is true despite the fact that we are case-insensitive. For
// example, Turkish dotless i does not match ASCII i or I because the
// definition of canonicalize() forbids it (ES5.1 15.10.2.8).
if (!isASCII(c_))
result |= MatchConstraintNonASCII;
return result | Super::matchConstraints();
private:
virtual void emitPureASCIIImpl(RegexBytecodeStream &bcs) const override {
return emitPureASCII<MatchNChar8Insn>(bcs);
};
virtual void emitMixedASCIIImpl(RegexBytecodeStream &bcs) const override {
return emitMixedASCII<MatchChar8Insn, MatchChar16Insn>(bcs);
}
};

virtual bool matchesExactlyOneCharacter() const override {
return true;
/// MatchCharICaseNode matches one or more characters, ignoring case.
/// Every literal is case-folded through the supplied Traits at construction
/// time, so the node stores and emits only the folded characters.
template <class Traits>
class MatchCharICaseNode final : public MatchCharNodeBase {
  using Super = MatchCharNodeBase;

  // Traits used for case conversion.
  const Traits &traits_;

 public:
  /// Construct a node matching the single character \p c after case-folding
  /// it with \p traits.
  MatchCharICaseNode(const Traits &traits, char16_t c)
      : Super(traits.caseFold(c)), traits_(traits) {}

  /// Construct a node matching every character in \p chars after
  /// case-folding. Note that \p chars itself is folded in place.
  MatchCharICaseNode(const Traits &traits, std::vector<char16_t> *chars)
      : Super(applyTraitsToAllChars(traits, chars)), traits_(traits) {}

  /// Case-fold every element of \p chars in place using \p traits.
  /// This is static because it is invoked from the mem-initializer list
  /// before the base class has been constructed; calling a non-static member
  /// function at that point is undefined behavior.
  /// \return pointer to chars after caseFolding every element.
  static std::vector<char16_t> *applyTraitsToAllChars(
      const Traits &traits,
      std::vector<char16_t> *chars) {
    for (auto &c : *chars)
      c = traits.caseFold(c);
    return chars;
  }

 private:
  /// Emit all literals as one case-insensitive multi-character instruction.
  virtual void emitPureASCIIImpl(RegexBytecodeStream &bcs) const override {
    return emitPureASCII<MatchNCharICase8Insn>(bcs);
  }

  /// Emit one case-insensitive single-character instruction per literal.
  virtual void emitMixedASCIIImpl(RegexBytecodeStream &bcs) const override {
    return emitMixedASCII<MatchCharICase8Insn, MatchCharICase16Insn>(bcs);
  }
};

Expand Down Expand Up @@ -957,7 +1044,6 @@ class Regex {
ForwardIterator last,
uint32_t backRefLimit,
uint32_t *outMaxBackRef);

void pushLeftAnchor();
void pushRightAnchor();
void pushMatchAnyButNewline();
Expand Down Expand Up @@ -1020,6 +1106,10 @@ class LookaheadNode : public Node {
return result | Super::matchConstraints();
}

/// Coalesce adjacent literal characters inside the node list held in exp_.
virtual void optimizeNodeContents() override {
coalesceCaseSensitiveASCIINodeList(exp_);
}

// Override emit() to compile our lookahead expression.
virtual void emit(RegexBytecodeStream &bcs) const override {
auto lookahead = bcs.emit<LookaheadInsn>();
Expand Down Expand Up @@ -1096,6 +1186,7 @@ ParseResult<ForwardIterator> Regex<Traits>::parseWithBackRefLimit(
// If we succeeded, add a goal node as the last node.
if (result) {
nodes_.push_back(make_unique<GoalNode>());
coalesceCaseSensitiveASCIINodeList(nodes_);
}

// Compute any match constraints.
Expand Down Expand Up @@ -1193,6 +1284,37 @@ void Regex<Traits>::pushLookahead(
appendNode<LookaheadNode>(move(exp), mexpBegin, mexpEnd, invert);
}

/// Replace each run of three or more adjacent coalescable nodes in \p nodes
/// with one multi-character MatchCharNode, compacting the list in place.
/// Every node is first asked to optimize its own contents, so nested node
/// lists are processed as well.
inline void coalesceCaseSensitiveASCIINodeList(NodeList &nodes) {
  // Let each node optimize whatever it contains before this list is touched.
  for (auto &child : nodes) {
    child->optimizeNodeContents();
  }

  // writeIdx: next slot of the compacted prefix that will be kept.
  // readIdx: node currently being examined; always >= writeIdx.
  size_t writeIdx = 0;
  for (size_t readIdx = 0; readIdx < nodes.size(); ++readIdx, ++writeIdx) {
    std::vector<char16_t> pending;

    if (nodes[readIdx]->tryCoalesceCharacters(&pending)) {
      // Extend the run for as long as the following nodes agree to join it.
      size_t runEnd = readIdx + 1;
      while (runEnd < nodes.size() &&
             nodes[runEnd]->tryCoalesceCharacters(&pending)) {
        ++runEnd;
      }

      // Shorter runs are not worth a multi-character instruction.
      const size_t runLength = runEnd - readIdx;
      if (runLength >= 3) {
        // Park the merged node on the last slot of the run; the swap below
        // moves it into the kept prefix and the rest of the run is dropped.
        readIdx = runEnd - 1;
        nodes[readIdx] = unique_ptr<MatchCharNode>(new MatchCharNode(&pending));
      }
    }

    // Slide the surviving node down into the compacted prefix.
    if (writeIdx != readIdx) {
      iter_swap(nodes.begin() + writeIdx, nodes.begin() + readIdx);
    }
  }

  // Everything past the compacted prefix is a leftover from a merged run.
  if (writeIdx < nodes.size()) {
    nodes.erase(nodes.begin() + writeIdx, nodes.end());
  }
}
} // namespace regex
} // namespace hermes

Expand Down
27 changes: 26 additions & 1 deletion include/hermes/Regex/RegexBytecode.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,27 @@ struct BracketInsn : public Insn {
}
};

// See BytecodeFileFormatTest for details about bit field layouts
/// Instruction that heads a run of 8-bit characters stored directly after it
/// in the bytecode stream.
struct MatchNChar8Insn : public Insn {
// Number of 8-bit characters following this instruction.
uint8_t charCount;

/// \return the width of this instruction plus its characters.
uint32_t totalWidth() const {
return sizeof(*this) + charCount * sizeof(char);
}
};

/// Case-insensitive counterpart of the multi-character instruction: heads a
/// run of 8-bit characters stored directly after it in the bytecode stream.
struct MatchNCharICase8Insn : public Insn {
// Number of 8-bit characters following this instruction.
uint8_t charCount;

/// \return the width of this instruction plus its characters.
uint32_t totalWidth() const {
return sizeof(*this) + charCount * sizeof(char);
}
};

// See BytecodeFileFormatTest for details about bit field layouts.
static_assert(
sizeof(BracketInsn) == 6,
"BracketInsn should take up 6 byte total");
Expand Down Expand Up @@ -282,6 +302,11 @@ class RegexBytecodeStream {
bytes_.insert(bytes_.end(), rangeBytes, rangeBytes + sizeof(range));
}

/// Emit a Char8 for use inside a MatchNChar8Insn or MatchNCharICase8Insn.
void emitChar8(char c) {
bytes_.push_back((uint8_t)c);
}

/// \return the current offset in the stream, which is where the next
/// instruction will be emitted. Note the header is omitted.
uint32_t currentOffset() const {
Expand Down
2 changes: 2 additions & 0 deletions include/hermes/Regex/RegexOpcodes.def
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ REOP(RightAnchor)
REOP(MatchAnyButNewline)
REOP(MatchChar8)
REOP(MatchChar16)
REOP(MatchNChar8)
REOP(MatchNCharICase8)
REOP(MatchCharICase8)
REOP(MatchCharICase16)
REOP(Alternation)
Expand Down
Loading

0 comments on commit 8733459

Please sign in to comment.