Implement isHighSurrogate/isLowSurrogate and other Unicode helpers

Summary: This adds some Unicode helper functions for determining whether a code point is a surrogate, and for decoding surrogate pairs. These will be used by future Unicode regexps.
Reviewed By: avp
Differential Revision: D17413808
fbshipit-source-id: 7b23f5796900090f192525519527bfecba5e54a0
tonyle9 · Oct 3, 2019 · 3f26192 · 3f26192
1 parent edeef8a
commit 3f26192
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 10 deletions.
diff --git a/include/hermes/Platform/Unicode/CharacterProperties.h b/include/hermes/Platform/Unicode/CharacterProperties.h
@@ -7,6 +7,7 @@
 #ifndef HERMES_PLATFORMUNICODE_CHARACTERPROPERTIES_H
 #define HERMES_PLATFORMUNICODE_CHARACTERPROPERTIES_H
 
+#include <cassert>
 #include <cstdint>
 
 namespace hermes {
@@ -19,6 +20,8 @@ const uint32_t UNICODE_SURROGATE_LAST = 0xDFFF;
 const uint32_t UTF16_HIGH_SURROGATE = 0xD800;
 const uint32_t UTF16_LOW_SURROGATE = 0xDC00;
 const uint32_t UNICODE_REPLACEMENT_CHARACTER = 0xFFFD;
+/// The last (highest) code point of the Basic Multilingual Plane (BMP).
+const uint32_t UNICODE_LAST_BMP = 0xFFFF;
 
 const uint32_t UNICODE_LINE_SEPARATOR = 0x2028;
 const uint32_t UNICODE_PARAGRAPH_SEPARATOR = 0x2029;
@@ -39,6 +42,29 @@ inline bool isValidCodePoint(uint32_t cp) {
       cp > UNICODE_MAX_VALUE);
 }
 
+/// \return true if \p cp lies in the Basic Multilingual Plane, i.e. it is at
+/// most UNICODE_LAST_BMP (0xFFFF). Surrogate code points count as BMP members.
+inline bool isMemberOfBMP(uint32_t cp) {
+  return cp <= UNICODE_LAST_BMP;
+}
+
+/// \return whether \p cp is a high surrogate, i.e. in the half-open range [UNICODE_SURROGATE_FIRST, UTF16_LOW_SURROGATE).
+inline bool isHighSurrogate(uint32_t cp) {
+  return UNICODE_SURROGATE_FIRST <= cp && cp < UTF16_LOW_SURROGATE;
+}
+
+/// \return whether \p cp is a low surrogate, i.e. in the closed range [UTF16_LOW_SURROGATE, UNICODE_SURROGATE_LAST].
+inline bool isLowSurrogate(uint32_t cp) {
+  return UTF16_LOW_SURROGATE <= cp && cp <= UNICODE_SURROGATE_LAST;
+}
+
+/// Decode a surrogate pair [\p hi, \p lo] into a code point >= 0x10000; asserts \p hi is a high and \p lo a low surrogate.
+inline uint32_t decodeSurrogatePair(uint32_t hi, uint32_t lo) {
+  assert(isHighSurrogate(hi) && isLowSurrogate(lo) && "Not a surrogate pair");
+  return ((hi - UTF16_HIGH_SURROGATE) << 10) + (lo - UTF16_LOW_SURROGATE) +
+      0x10000;
+}
+
 /// \return true if the codepoint is not ASCII and is a Unicode letter.
 bool isUnicodeOnlyLetter(uint32_t cp);
 /// \return true if the codepoint is not ASCII and is a Unicode space.

diff --git a/lib/Support/UTF8.cpp b/lib/Support/UTF8.cpp
@@ -66,14 +66,6 @@ bool convertUTF16ToUTF8WithReplacements(
     std::string &out,
     llvm::ArrayRef<char16_t> input,
     size_t maxCharacters) {
-  auto isHighSurrogate = [](char16_t c) {
-    return UNICODE_SURROGATE_FIRST <= c && c < UTF16_LOW_SURROGATE;
-  };
-
-  auto isLowSurrogate = [](char16_t c) {
-    return UTF16_LOW_SURROGATE <= c && c <= UNICODE_SURROGATE_LAST;
-  };
-
   out.clear();
   out.reserve(input.size());
   // Stop early if we've reached currNumCharacters worth of UTF-8 characters.
@@ -103,8 +95,7 @@ bool convertUTF16ToUTF8WithReplacements(
         c32 = UNICODE_REPLACEMENT_CHARACTER;
       } else {
         // Decode surrogate pair and increment, because we consumed two chars.
-        c32 = cur[0] - UTF16_HIGH_SURROGATE;
-        c32 = (c32 << 10) + (cur[1] - UTF16_LOW_SURROGATE) + 0x010000;
+        c32 = decodeSurrogatePair(cur[0], cur[1]);
         ++cur;
       }
     } else {