Skip to content

Commit

Permalink
Implement isHighSurrogate/isLowSurrogate and other Unicode helpers
Browse files Browse the repository at this point in the history
Summary:
This adds some Unicode helper functions for determining if a code point is a
surrogate, and for decoding surrogate pairs. These will be used by future
Unicode regexps.

Reviewed By: avp

Differential Revision: D17413808

fbshipit-source-id: 7b23f5796900090f192525519527bfecba5e54a0
  • Loading branch information
Peter Ammon authored and facebook-github-bot committed Oct 3, 2019
1 parent edeef8a commit 3f26192
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 10 deletions.
26 changes: 26 additions & 0 deletions include/hermes/Platform/Unicode/CharacterProperties.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#ifndef HERMES_PLATFORMUNICODE_CHARACTERPROPERTIES_H
#define HERMES_PLATFORMUNICODE_CHARACTERPROPERTIES_H

#include <cassert>
#include <cstdint>

namespace hermes {
Expand All @@ -19,6 +20,8 @@ const uint32_t UNICODE_SURROGATE_LAST = 0xDFFF;
const uint32_t UTF16_HIGH_SURROGATE = 0xD800;
const uint32_t UTF16_LOW_SURROGATE = 0xDC00;
const uint32_t UNICODE_REPLACEMENT_CHARACTER = 0xFFFD;
/// The last member of the BMP.
const uint32_t UNICODE_LAST_BMP = 0xFFFF;

const uint32_t UNICODE_LINE_SEPARATOR = 0x2028;
const uint32_t UNICODE_PARAGRAPH_SEPARATOR = 0x2029;
Expand All @@ -39,6 +42,29 @@ inline bool isValidCodePoint(uint32_t cp) {
cp > UNICODE_MAX_VALUE);
}

/// Test whether \p cp lies in the Basic Multilingual Plane, i.e. whether it
/// does not exceed UNICODE_LAST_BMP. No surrogate exclusion is applied, so
/// surrogate code points count as members of the BMP.
/// \return true if \p cp is in the BMP, false otherwise.
inline bool isMemberOfBMP(uint32_t cp) {
  const bool beyondBMP = cp > UNICODE_LAST_BMP;
  return !beyondBMP;
}

/// Test whether \p cp falls in the half-open range
/// [UNICODE_SURROGATE_FIRST, UTF16_LOW_SURROGATE).
/// \return true if \p cp is a high surrogate, false otherwise.
inline bool isHighSurrogate(uint32_t cp) {
  // Anything below the first surrogate cannot be a surrogate at all.
  if (cp < UNICODE_SURROGATE_FIRST) {
    return false;
  }
  // High surrogates end right before the first low surrogate.
  return cp < UTF16_LOW_SURROGATE;
}

/// Test whether \p cp falls in the closed range
/// [UTF16_LOW_SURROGATE, UNICODE_SURROGATE_LAST].
/// \return true if \p cp is a low surrogate, false otherwise.
inline bool isLowSurrogate(uint32_t cp) {
  // Anything past the last surrogate cannot be a surrogate at all.
  if (cp > UNICODE_SURROGATE_LAST) {
    return false;
  }
  // Low surrogates start at UTF16_LOW_SURROGATE.
  return cp >= UTF16_LOW_SURROGATE;
}

/// Combine the surrogate pair [\p hi, \p lo] into the single code point it
/// encodes.
/// \param hi the leading unit; asserted to satisfy isHighSurrogate().
/// \param lo the trailing unit; asserted to satisfy isLowSurrogate().
/// \return the decoded code point, which is always at least 0x10000 for a
///   valid pair.
inline uint32_t decodeSurrogatePair(uint32_t hi, uint32_t lo) {
  assert(isHighSurrogate(hi) && isLowSurrogate(lo) && "Not a surrogate pair");
  // The high surrogate's offset supplies the upper bits of the result and the
  // low surrogate's offset the lower 10 bits.
  const uint32_t upperBits = (hi - UTF16_HIGH_SURROGATE) << 10;
  const uint32_t lowerBits = lo - UTF16_LOW_SURROGATE;
  // Surrogate pairs encode code points starting just past the BMP.
  return upperBits + lowerBits + 0x10000;
}

/// \return true if the codepoint is not ASCII and is a Unicode letter.
bool isUnicodeOnlyLetter(uint32_t cp);
/// \return true if the codepoint is not ASCII and is a Unicode space.
Expand Down
11 changes: 1 addition & 10 deletions lib/Support/UTF8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,6 @@ bool convertUTF16ToUTF8WithReplacements(
std::string &out,
llvm::ArrayRef<char16_t> input,
size_t maxCharacters) {
auto isHighSurrogate = [](char16_t c) {
return UNICODE_SURROGATE_FIRST <= c && c < UTF16_LOW_SURROGATE;
};

auto isLowSurrogate = [](char16_t c) {
return UTF16_LOW_SURROGATE <= c && c <= UNICODE_SURROGATE_LAST;
};

out.clear();
out.reserve(input.size());
// Stop early if we've reached currNumCharacters worth of UTF-8 characters.
Expand Down Expand Up @@ -103,8 +95,7 @@ bool convertUTF16ToUTF8WithReplacements(
c32 = UNICODE_REPLACEMENT_CHARACTER;
} else {
// Decode surrogate pair and increment, because we consumed two chars.
c32 = cur[0] - UTF16_HIGH_SURROGATE;
c32 = (c32 << 10) + (cur[1] - UTF16_LOW_SURROGATE) + 0x010000;
c32 = decodeSurrogatePair(cur[0], cur[1]);
++cur;
}
} else {
Expand Down

0 comments on commit 3f26192

Please sign in to comment.