Optimize matching multiple literal characters

Summary: Hermes is a bytecode compiler for JavaScript. This diff will optimize the implementation of literal opcodes in its regex engine. Literal characters in a regex like /abcdefg/ are compiled as a sequence of MatchChar8 opcodes. This requires a separate instruction dispatch for each character. This diff will add a multi-character variant: a single regex opcode which can match multiple characters. Example: > echo '/abcd/' | hermes -dump-bytecode Header: marked: 0 loops: 0 flags: 0 constraints: 4 0000 MatchChar8: 'a' 0002 MatchChar8: 'b' 0004 MatchChar8: 'c' 0006 MatchChar8: 'd' 0008 Goal Should Be: Example: > echo '/abcd/' | hermes -dump-bytecode Header: marked: 0 loops: 0 flags: 0 constraints: 4 0000 MatchNChar8: 'abcd' 0006 Goal While this may not be a large 'win' in terms of instruction compression, it does make bytecode easier to read. The optimization only saves one byte per MatchChar, starting with the 3rd char in a group. Single chars should *not* become MatchNChar8 as doing so is less memory efficient, and less readible. Reviewed By: ridiculousfish Differential Revision: D16119452 fbshipit-source-id: 8f386a582d6d6a921f6c3bbf66280c05e682a4e8
EmperorEarth · Jul 18, 2019 · 8733459 · 8733459
1 parent 7e803f4
commit 8733459
Show file tree

Hide file tree

Showing 6 changed files with 301 additions and 41 deletions.
diff --git a/include/hermes/Regex/Compiler.h b/include/hermes/Regex/Compiler.h
@@ -21,6 +21,7 @@
 
 #include "hermes/Regex/RegexBytecode.h"
 
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -298,6 +299,16 @@ class Node {
     return false;
   }
 
+  /// Perform optimization on Node's contents.
+  virtual void optimizeNodeContents() {}
+
+  /// If this Node can be coalesced into a single MatchCharNode,
+  /// then add the node's characters to \p output and \return true.
+  /// Otherwise \return false.
+  virtual bool tryCoalesceCharacters(std::vector<char16_t> *output) const {
+    return false;
+  }
+
  protected:
   /// \return the match constraints for this node.
   /// This should be overridden by subclasses to report the constraints for that
@@ -312,6 +323,9 @@ class Node {
   virtual void emit(RegexBytecodeStream &bcs) const {}
 };
 
+/// Coalesce adjacent MatchCharNode in NodeList.
+static void coalesceCaseSensitiveASCIINodeList(NodeList &nodes);
+
 /// GoalNode is the terminal Node that represents successful execution.
 class GoalNode final : public Node {
  public:
@@ -384,6 +398,10 @@ class LoopNode final : public Node {
     return result | Super::matchConstraints();
   }
 
+  virtual void optimizeNodeContents() override {
+    coalesceCaseSensitiveASCIINodeList(loopee_);
+  }
+
  private:
   /// Override of emit() to compile our looped expression and add a jump
   /// back to the loop.
@@ -481,6 +499,11 @@ class AlternationNode final : public Node {
     return result | Super::matchConstraints();
   }
 
+  virtual void optimizeNodeContents() override {
+    coalesceCaseSensitiveASCIINodeList(first_);
+    coalesceCaseSensitiveASCIINodeList(second_);
+  }
+
   void emit(RegexBytecodeStream &bcs) const override {
     // Instruction stream looks like:
     //   [Alternation][PrimaryBranch][Jump][SecondaryBranch][...]
@@ -616,74 +639,138 @@ class MatchAnyButNewlineNode final : public Node {
   }
 };
 
-/// MatchChar matches a single character, specified as a parameter to the
+/// MatchChar matches one or more characters, specified as a parameter to the
 /// constructor.
-class MatchCharNode final : public Node {
+class MatchCharNodeBase : public Node {
   using Super = Node;
 
-  // The character we wish to match against.
-  char16_t c_;
-
  public:
-  MatchCharNode(char16_t c) : c_(c) {}
+  // The character literals we wish to match against.
+  llvm::SmallVector<char16_t, 10> literals_;
+
+  MatchCharNodeBase(char16_t c) : literals_{c} {}
+
+  MatchCharNodeBase(std::vector<char16_t> *chars) {
+    literals_.insert(literals_.begin(), chars->begin(), chars->end());
+  }
 
   virtual MatchConstraintSet matchConstraints() const override {
     MatchConstraintSet result = MatchConstraintNonEmpty;
     // If our character is not ASCII, then we cannot match pure-ASCII strings.
-    if (!isASCII(c_))
+    if (!isPureASCII()) {
       result |= MatchConstraintNonASCII;
+    }
     return result | Super::matchConstraints();
   }
 
   void emit(RegexBytecodeStream &bcs) const override {
-    if (isASCII(c_)) {
-      bcs.emit<MatchChar8Insn>()->c = c_;
+    // if size is 1, then NChar instruction is less efficient than singular.
+    if (isPureASCII() && literals_.size() > 1) {
+      emitPureASCIIImpl(bcs);
     } else {
-      bcs.emit<MatchChar16Insn>()->c = c_;
+      emitMixedASCIIImpl(bcs);
     }
   }
 
+ protected:
+  /// \return true if every character in literals_ is ASCII.
+  bool isPureASCII() const {
+    return std::all_of(literals_.begin(), literals_.end(), isASCII);
+  }
+
   virtual bool matchesExactlyOneCharacter() const override {
-    return true;
+    return literals_.size() == 1;
   }
-};
 
-/// MatchCharICase matches a single character, ignoring case.
-template <class Traits>
-class MatchCharICaseNode : public Node {
-  using Super = Node;
+  /// Implementation of emitPureASCII to avoid class templates.
+  virtual void emitPureASCIIImpl(RegexBytecodeStream &bcs) const = 0;
+  /// Compile node and all characters as ASCII to a bytecode stream \p bcs.
+  template <typename MatchNChar8InsnType>
+  void emitPureASCII(RegexBytecodeStream &bcs) const {
+    assert(
+        literals_.size() <= UINT8_MAX &&
+        "Literal count cannot exceed uint8 max");
+    auto insn = bcs.emit<MatchNChar8InsnType>();
+    insn->charCount = literals_.size();
+    for (const auto c : literals_) {
+      bcs.emitChar8(c);
+    }
+  }
 
-  // Traits used for case conversion.
-  const Traits &traits_;
+  /// Implementation of emitMixedASCII to avoid class templates.
+  virtual void emitMixedASCIIImpl(RegexBytecodeStream &bcs) const = 0;
+  /// Compile node and all characters to a bytecode stream \p bcs.
+  template <typename MatchChar8InsnType, typename MatchChar16InsnType>
+  void emitMixedASCII(RegexBytecodeStream &bcs) const {
+    for (const auto c : literals_) {
+      if (isASCII(c)) {
+        bcs.emit<MatchChar8InsnType>()->c = c;
+      } else {
+        bcs.emit<MatchChar16InsnType>()->c = c;
+      }
+    }
+  }
+};
 
-  // The character we wish to match against.
-  char16_t c_;
+class MatchCharNode final : public MatchCharNodeBase {
+  using Super = MatchCharNodeBase;
 
  public:
-  MatchCharICaseNode(const Traits &traits, char16_t c)
-      : traits_(traits), c_(traits.caseFold(c)) {}
+  MatchCharNode(char16_t c) : Super(c) {}
 
-  virtual void emit(RegexBytecodeStream &bcs) const override {
-    if (isASCII(c_)) {
-      bcs.emit<MatchCharICase8Insn>()->c = c_;
-    } else {
-      bcs.emit<MatchCharICase16Insn>()->c = c_;
+  MatchCharNode(std::vector<char16_t> *chars) : Super(chars) {}
+
+  virtual bool tryCoalesceCharacters(
+      std::vector<char16_t> *output) const override {
+    // Only coalesce pure ASCII characters.
+    if (!isPureASCII()) {
+      return false;
     }
+    // Only coalesce if the size of the new vector can be represented as
+    // uint8_t in the corresponding instruction's charCount.
+    if (UINT8_MAX <= output->size() + literals_.size()) {
+      return false;
+    }
+    output->insert(output->end(), literals_.begin(), literals_.end());
+    return true;
   }
 
-  virtual MatchConstraintSet matchConstraints() const override {
-    MatchConstraintSet result = MatchConstraintNonEmpty;
-    // If our character is not ASCII, then we cannot match pure-ASCII strings.
-    // Note this is true despite the fact that we are case-insensitive. For
-    // example, Turkish dotless i does not match ASCII i or I because the
-    // definition of canonicalize() forbids it (ES5.1 15.10.2.8).
-    if (!isASCII(c_))
-      result |= MatchConstraintNonASCII;
-    return result | Super::matchConstraints();
+ private:
+  virtual void emitPureASCIIImpl(RegexBytecodeStream &bcs) const override {
+    return emitPureASCII<MatchNChar8Insn>(bcs);
+  };
+  virtual void emitMixedASCIIImpl(RegexBytecodeStream &bcs) const override {
+    return emitMixedASCII<MatchChar8Insn, MatchChar16Insn>(bcs);
   }
+};
 
-  virtual bool matchesExactlyOneCharacter() const override {
-    return true;
+template <class Traits>
+class MatchCharICaseNode final : public MatchCharNodeBase {
+  using Super = MatchCharNodeBase;
+  const Traits &traits_;
+
+ public:
+  MatchCharICaseNode(const Traits &traits, char16_t c)
+      : Super(traits.caseFold(c)), traits_(traits) {}
+
+  MatchCharICaseNode(const Traits &traits, std::vector<char16_t> *chars)
+      : Super(applyTraitsToAllChars(traits, chars)), traits_(traits) {}
+
+  /// \return pointer to chars after caseFolding every element.
+  std::vector<char16_t> *applyTraitsToAllChars(
+      const Traits &traits,
+      std::vector<char16_t> *chars) {
+    for (auto &c : *chars)
+      c = traits.caseFold(c);
+    return chars;
+  }
+
+ private:
+  virtual void emitPureASCIIImpl(RegexBytecodeStream &bcs) const override {
+    return emitPureASCII<MatchNCharICase8Insn>(bcs);
+  };
+  virtual void emitMixedASCIIImpl(RegexBytecodeStream &bcs) const override {
+    return emitMixedASCII<MatchCharICase8Insn, MatchCharICase16Insn>(bcs);
   }
 };
 
@@ -957,7 +1044,6 @@ class Regex {
       ForwardIterator last,
       uint32_t backRefLimit,
       uint32_t *outMaxBackRef);
-
   void pushLeftAnchor();
   void pushRightAnchor();
   void pushMatchAnyButNewline();
@@ -1020,6 +1106,10 @@ class LookaheadNode : public Node {
     return result | Super::matchConstraints();
   }
 
+  virtual void optimizeNodeContents() override {
+    coalesceCaseSensitiveASCIINodeList(exp_);
+  }
+
   // Override emit() to compile our lookahead expression.
   virtual void emit(RegexBytecodeStream &bcs) const override {
     auto lookahead = bcs.emit<LookaheadInsn>();
@@ -1096,6 +1186,7 @@ ParseResult<ForwardIterator> Regex<Traits>::parseWithBackRefLimit(
   // If we succeeded, add a goal node as the last node.
   if (result) {
     nodes_.push_back(make_unique<GoalNode>());
+    coalesceCaseSensitiveASCIINodeList(nodes_);
   }
 
   // Compute any match constraints.
@@ -1193,6 +1284,37 @@ void Regex<Traits>::pushLookahead(
   appendNode<LookaheadNode>(move(exp), mexpBegin, mexpEnd, invert);
 }
 
+inline void coalesceCaseSensitiveASCIINodeList(NodeList &nodes) {
+  for (auto &node : nodes) {
+    node->optimizeNodeContents();
+  }
+
+  size_t clean = 0; // index tracking sub-vector of nodes not to delete
+  size_t start = 0; // index expoloring vector for nodes to coalesce
+  for (; start < nodes.size(); clean++, start++) {
+    std::vector<char16_t> chars;
+
+    if (nodes[start]->tryCoalesceCharacters(&chars)) {
+      size_t end = start + 1;
+      for (; end < nodes.size(); end++) { // See how many we can coalesce.
+        if (!nodes[end]->tryCoalesceCharacters(&chars)) {
+          break;
+        }
+      }
+
+      if (end - start >= 3) { // Coalesce if we get 3 or more chars.
+        start = end - 1;
+        nodes[start] = unique_ptr<MatchCharNode>(new MatchCharNode(&chars));
+      }
+    }
+    if (clean != start) { // updates to the clean node
+      iter_swap(nodes.begin() + clean, nodes.begin() + start);
+    }
+  }
+  if (clean < nodes.size()) { // erase all nodes after after clean index
+    nodes.erase(nodes.begin() + clean, nodes.end());
+  }
+}
 } // namespace regex
 } // namespace hermes
 

diff --git a/include/hermes/Regex/RegexBytecode.h b/include/hermes/Regex/RegexBytecode.h
@@ -99,7 +99,27 @@ struct BracketInsn : public Insn {
   }
 };
 
-// See BytecodeFileFormatTest for details about bit field layouts
+struct MatchNChar8Insn : public Insn {
+  // number of 8-byte char following this instruction.
+  uint8_t charCount;
+
+  /// \return the width of this instruction plus its characters.
+  uint32_t totalWidth() const {
+    return sizeof(*this) + charCount * sizeof(char);
+  }
+};
+
+struct MatchNCharICase8Insn : public Insn {
+  // number of 8-byte char following this instruction.
+  uint8_t charCount;
+
+  /// \return the width of this instruction plus its characters.
+  uint32_t totalWidth() const {
+    return sizeof(*this) + charCount * sizeof(char);
+  }
+};
+
+// See BytecodeFileFormatTest for details about bit field layouts.
 static_assert(
     sizeof(BracketInsn) == 6,
     "BracketInsn should take up 6 byte total");
@@ -282,6 +302,11 @@ class RegexBytecodeStream {
     bytes_.insert(bytes_.end(), rangeBytes, rangeBytes + sizeof(range));
   }
 
+  /// Emit a Char8 for use inside a MatchNChar8Insn or MatchNCharICase8Insn.
+  void emitChar8(char c) {
+    bytes_.push_back((uint8_t)c);
+  }
+
   /// \return the current offset in the stream, which is where the next
   /// instruction will be emitted. Note the header is omitted.
   uint32_t currentOffset() const {

diff --git a/include/hermes/Regex/RegexOpcodes.def b/include/hermes/Regex/RegexOpcodes.def
@@ -14,6 +14,8 @@ REOP(RightAnchor)
 REOP(MatchAnyButNewline)
 REOP(MatchChar8)
 REOP(MatchChar16)
+REOP(MatchNChar8)
+REOP(MatchNCharICase8)
 REOP(MatchCharICase8)
 REOP(MatchCharICase16)
 REOP(Alternation)