Skip to content

Commit

Permalink
use "unsigned char" for bytecode string storage
Browse files Browse the repository at this point in the history
Summary:
The bytecode string storage was typed as "char" and typically kept as
std::vector<char>, which was misleading since it contains both ASCII
and UTF16 values. It is a sequence of bytes, not chars. Parts of that
sequence can later be interpreted as one type or another.

Change it to use "unsigned char", which brings it in line with other
storage pools in the bytecode.

Perhaps more importantly, this makes it easier to use the string
deduping algorithms for binary data.

Reviewed By: avp

Differential Revision: D18056231

fbshipit-source-id: f97b7e5bbf94e82886bf92bea382ed7a9f592a0a
  • Loading branch information
tmikov authored and facebook-github-bot committed Apr 14, 2020
1 parent ab92cf4 commit f4127df
Show file tree
Hide file tree
Showing 12 changed files with 50 additions and 44 deletions.
6 changes: 3 additions & 3 deletions include/hermes/BCGen/HBC/Bytecode.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,8 @@ class BytecodeModule {
/// each string in the string storage.
std::vector<StringTableEntry> stringTable_;

/// The global string storage. A sequence of chars.
std::vector<char> stringStorage_;
/// The global string storage. A sequence of bytes.
std::vector<unsigned char> stringStorage_;

/// The regexp bytecode buffer.
std::vector<unsigned char> regExpStorage_;
Expand Down Expand Up @@ -253,7 +253,7 @@ class BytecodeModule {
std::vector<StringKind::Entry> &&stringKinds,
std::vector<uint32_t> &&identifierTranslations,
std::vector<StringTableEntry> &&stringTable,
std::vector<char> &&stringStorage,
std::vector<unsigned char> &&stringStorage,
std::vector<RegExpTableEntry> &&regExpTable,
std::vector<unsigned char> &&regExpStorage,
uint32_t globalFunctionIndex,
Expand Down
7 changes: 4 additions & 3 deletions include/hermes/BCGen/HBC/BytecodeDataProvider.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ class BCProviderBase {
uint32_t stringCount_{};
llvm::ArrayRef<StringKind::Entry> stringKinds_{};
llvm::ArrayRef<uint32_t> identifierTranslations_{};
llvm::ArrayRef<char> stringStorage_{};
llvm::ArrayRef<unsigned char> stringStorage_{};

llvm::ArrayRef<unsigned char> arrayBuffer_{};
llvm::ArrayRef<unsigned char> objKeyBuffer_{};
Expand Down Expand Up @@ -169,7 +169,7 @@ class BCProviderBase {
llvm::ArrayRef<uint32_t> getIdentifierTranslations() const {
return identifierTranslations_;
}
llvm::ArrayRef<char> getStringStorage() const {
llvm::ArrayRef<unsigned char> getStringStorage() const {
return stringStorage_;
}
llvm::ArrayRef<unsigned char> getArrayBuffer() const {
Expand Down Expand Up @@ -205,7 +205,8 @@ class BCProviderBase {
llvm::StringRef getStringRefFromID(StringID stringID) const {
auto entry = getStringTableEntry(stringID);
return llvm::StringRef(
getStringStorage().begin() + entry.getOffset(), entry.getLength());
(const char *)getStringStorage().begin() + entry.getOffset(),
entry.getLength());
}

/// Get the global debug info, lazily create it.
Expand Down
2 changes: 1 addition & 1 deletion include/hermes/BCGen/HBC/BytecodeFileFormat.h
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ struct BytecodeFileFields {
Array<hbc::OverflowStringTableEntry> stringTableOverflowEntries{};

/// The character buffer used for string storage.
Array<char> stringStorage;
Array<uint8_t> stringStorage;

/// Buffer for array literals.
Array<uint8_t> arrayBuffer;
Expand Down
8 changes: 4 additions & 4 deletions include/hermes/BCGen/HBC/ConsecutiveStringStorage.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ namespace hbc {
/// \return a StringRef of the string.
llvm::StringRef getStringFromEntry(
const StringTableEntry &entry,
llvm::ArrayRef<char> storage,
llvm::ArrayRef<unsigned char> storage,
std::string &utf8ConversionStorage);

/// A data structure for storing a serialized list of strings.
Expand All @@ -52,7 +52,7 @@ class ConsecutiveStringStorage {
std::vector<StringTableEntry> strTable_{};

/// A consecutive storage of char sequences.
std::vector<char> storage_{};
std::vector<unsigned char> storage_{};

/// Whether the string table is still valid to use.
bool isTableValid_{true};
Expand Down Expand Up @@ -86,7 +86,7 @@ class ConsecutiveStringStorage {
/// Construct from a table and storage.
ConsecutiveStringStorage(
std::vector<StringTableEntry> &&table,
std::vector<char> &&storage)
std::vector<unsigned char> &&storage)
: strTable_(std::move(table)), storage_(std::move(storage)) {}

/// \returns a view to the current table.
Expand Down Expand Up @@ -118,7 +118,7 @@ class ConsecutiveStringStorage {
/// \returns a reference to the string storage. Notice that whoever receives
/// the table may temper, swap or destroy the content. Hence after this
/// call, the string table is no longer valid to use.
std::vector<char> acquireStringStorage() {
std::vector<unsigned char> acquireStringStorage() {
ensureStorageValid();
isStorageValid_ = false;
return std::move(storage_);
Expand Down
6 changes: 3 additions & 3 deletions include/hermes/BCGen/HBC/DebugInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ class DebugInfo {
std::vector<StringTableEntry> filenameTable_{};

/// String storage for filenames.
std::vector<char> filenameStorage_{};
std::vector<unsigned char> filenameStorage_{};

DebugFileRegionList files_{};
uint32_t lexicalDataOffset_ = 0;
Expand All @@ -154,7 +154,7 @@ class DebugInfo {

explicit DebugInfo(
std::vector<StringTableEntry> &&filenameStrings,
std::vector<char> &&filenameStorage,
std::vector<unsigned char> &&filenameStorage,
DebugFileRegionList &&files,
uint32_t lexicalDataOffset,
StreamVector<uint8_t> &&data)
Expand All @@ -175,7 +175,7 @@ class DebugInfo {
llvm::ArrayRef<StringTableEntry> getFilenameTable() const {
return filenameTable_;
}
llvm::ArrayRef<char> getFilenameStorage() const {
llvm::ArrayRef<unsigned char> getFilenameStorage() const {
return filenameStorage_;
}

Expand Down
4 changes: 2 additions & 2 deletions include/hermes/BCGen/HBC/UniquingStringLiteralTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ struct StringLiteralTable final : public StringLiteralIDMapping {

/// Exposes interface to extract parts of underlying ConsecutiveStringStorage
inline std::vector<StringTableEntry> acquireStringTable();
inline std::vector<char> acquireStringStorage();
inline std::vector<unsigned char> acquireStringStorage();

/// \returns a list of translations corresponding to the strings marked as
/// identifiers, in their order in the underlying storage.
Expand Down Expand Up @@ -139,7 +139,7 @@ inline std::vector<StringTableEntry> StringLiteralTable::acquireStringTable() {
return storage_.acquireStringTable();
}

inline std::vector<char> StringLiteralTable::acquireStringStorage() {
inline std::vector<unsigned char> StringLiteralTable::acquireStringStorage() {
return storage_.acquireStringStorage();
}

Expand Down
2 changes: 1 addition & 1 deletion include/hermes/Support/StringTableEntry.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class StringTableEntry {
public:
using StringTableRefTy = llvm::ArrayRef<StringTableEntry>;
using MutStringTableRefTy = llvm::MutableArrayRef<StringTableEntry>;
using StringStorageRefTy = llvm::ArrayRef<char>;
using StringStorageRefTy = llvm::ArrayRef<unsigned char>;

StringTableEntry(uint32_t offset, uint32_t length, bool isUTF16)
: offset_(offset), length_(length) {
Expand Down
4 changes: 2 additions & 2 deletions lib/BCGen/HBC/BytecodeDataProvider.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ bool BytecodeFileFields<Mutable>::populateFromBuffer(

void visitStringStorage() {
align(buf);
f.stringStorage = castArrayRef<char>(buf, h->stringStorageSize, end);
f.stringStorage = castArrayRef<unsigned char>(buf, h->stringStorageSize, end);
}
void visitArrayBuffer() {
align(buf);
Expand Down Expand Up @@ -577,7 +577,7 @@ void BCProviderFromBuffer::createDebugInfo() {
auto filenameTable =
castArrayRef<StringTableEntry>(buf, header->filenameCount, end_);
auto filenameStorage =
castArrayRef<char>(buf, header->filenameStorageSize, end_);
castArrayRef<unsigned char>(buf, header->filenameStorageSize, end_);

hbc::DebugInfo::DebugFileRegionList files;
for (unsigned i = 0; i < header->fileRegionCount; i++) {
Expand Down
2 changes: 1 addition & 1 deletion lib/BCGen/HBC/BytecodeStream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ void BytecodeSerializer::serializeDebugInfo(BytecodeModule &BM) {

const llvm::ArrayRef<StringTableEntry> filenameTable =
info.getFilenameTable();
const llvm::ArrayRef<char> filenameStorage = info.getFilenameStorage();
const auto filenameStorage = info.getFilenameStorage();
const DebugInfo::DebugFileRegionList &files = info.viewFiles();
const StreamVector<uint8_t> &data = info.viewData();
uint32_t lexOffset = info.lexicalDataOffset();
Expand Down
41 changes: 23 additions & 18 deletions lib/BCGen/HBC/ConsecutiveStringStorage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ class StringTableBuilder {

public:
// Entries which are in ASCII.
std::vector<StringPacker<char>::StringEntry> asciiStrings_;
std::vector<StringPacker<unsigned char>::StringEntry> asciiStrings_;

// Entries which contain non-ASCII characters.
std::vector<StringPacker<char16_t>::StringEntry> u16Strings_;
Expand All @@ -583,15 +583,17 @@ class StringTableBuilder {
uint32_t index = 0;
for (auto it = begin; it != end; ++it) {
auto &str = *it;
const char *begin = str.data();
const char *end = begin + str.size();
if (isAllASCII(str.begin(), str.end())) {
ArrayRef<char> astr(begin, end);
static_assert(sizeof(str.data()[0]) == 1, "strings must be UTF8");
const unsigned char *begin = (const unsigned char *)str.data();
const unsigned char *end = begin + str.size();
if (isAllASCII(begin, end)) {
ArrayRef<unsigned char> astr(begin, end);
asciiStrings_.emplace_back(index, astr);
} else {
u16StringStorage_.emplace_back();
std::vector<char16_t> &ustr = u16StringStorage_.back();
convertUTF8WithSurrogatesToUTF16(std::back_inserter(ustr), begin, end);
convertUTF8WithSurrogatesToUTF16(
std::back_inserter(ustr), (const char *)begin, (const char *)end);
u16Strings_.emplace_back(index, ustr);
}
index++;
Expand All @@ -617,7 +619,7 @@ class StringTableBuilder {
/// If \p optimize is set, attempt to find a more efficient packing by reusing
/// substrings and prefix-suffix overlaps.
void packIntoStorage(
std::vector<char> *asciiStorage,
std::vector<unsigned char> *asciiStorage,
std::vector<char16_t> *u16Storage,
bool optimize) {
NamedRegionTimer timer(
Expand All @@ -628,16 +630,19 @@ class StringTableBuilder {
AreStatisticsEnabled());
// Note these assignments use efficient move-assignment, not copying.
if (optimize) {
*asciiStorage = StringPacker<char>::optimizingPackStrings(asciiStrings_);
*asciiStorage =
StringPacker<unsigned char>::optimizingPackStrings(asciiStrings_);
*u16Storage = StringPacker<char16_t>::optimizingPackStrings(u16Strings_);
} else {
*asciiStorage = StringPacker<char>::fastPackStrings(asciiStrings_);
*asciiStorage =
StringPacker<unsigned char>::fastPackStrings(asciiStrings_);
*u16Storage = StringPacker<char16_t>::fastPackStrings(u16Strings_);
}

#ifndef NDEBUG
// Ensure that our packing was correct.
StringPacker<char>::validateStringPacking(asciiStrings_, *asciiStorage);
StringPacker<unsigned char>::validateStringPacking(
asciiStrings_, *asciiStorage);
StringPacker<char16_t>::validateStringPacking(u16Strings_, *u16Storage);
#endif
}
Expand All @@ -650,7 +655,7 @@ class StringTableBuilder {
/// negated to indicate they are Unicode. If a string is in the
/// \p identifiers, we also mark the second most significant bit.
std::vector<StringTableEntry> generateStringTable(
ArrayRef<char> storage,
ArrayRef<unsigned char> storage,
size_t u16OffsetAdjust) {
// Each of our StringEntries remembers its original index in the initial
// array. Create a table large enough, and set the corresponding index in
Expand Down Expand Up @@ -678,7 +683,7 @@ class StringTableBuilder {
/// \return the offset of the u16 storage in that byte array.
static size_t appendU16Storage(
ArrayRef<char16_t> u16Storage,
std::vector<char> *output) {
std::vector<unsigned char> *output) {
using namespace llvm::support;
static_assert(sizeof(char16_t) == 2, "sizeof char16_t unexpectedly not 2");
if (u16Storage.empty()) {
Expand All @@ -694,7 +699,7 @@ class StringTableBuilder {
// Make space, and write as little endian.
size_t offset = output->size();
output->resize(output->size() + sizeof(char16_t) * u16Storage.size());
char *cursor = &output->at(offset);
unsigned char *cursor = &output->at(offset);
for (char16_t s : u16Storage) {
endian::write<char16_t, little, 0>(cursor, s);
cursor += sizeof(s);
Expand All @@ -716,7 +721,7 @@ ConsecutiveStringStorage::ConsecutiveStringStorage(
// Prepare to build our string table.
// Generate storage for our ASCII and u16 strings.
StringTableBuilder builder(begin, end);
std::vector<char> asciiStorage;
std::vector<unsigned char> asciiStorage;
std::vector<char16_t> u16Storage;
builder.packIntoStorage(&asciiStorage, &u16Storage, optimize);

Expand Down Expand Up @@ -753,13 +758,13 @@ uint32_t ConsecutiveStringStorage::getEntryHash(size_t i) const {
ensureStorageValid();

auto &entry = strTable_[i];
const char *data = &storage_[entry.getOffset()];
const unsigned char *data = &storage_[entry.getOffset()];
uint32_t length = entry.getLength();
if (entry.isUTF16()) {
const char16_t *u16data = reinterpret_cast<const char16_t *>(data);
return hermes::hashString(ArrayRef<char16_t>{u16data, length});
} else {
return hermes::hashString(ArrayRef<char>{data, length});
return hermes::hashString(ArrayRef<char>{(const char *)data, length});
}
}

Expand Down Expand Up @@ -795,15 +800,15 @@ llvm::StringRef ConsecutiveStringStorage::getStringAtIndex(

llvm::StringRef getStringFromEntry(
const StringTableEntry &entry,
llvm::ArrayRef<char> storage,
llvm::ArrayRef<unsigned char> storage,
std::string &utf8ConversionStorage) {
uint32_t offset = entry.getOffset();
uint32_t length = entry.getLength();
assert(
offset + length <= storage.size() && offset + length >= offset &&
"Invalid entry");
if (!entry.isUTF16()) {
return StringRef{storage.data() + offset, length};
return StringRef{(const char *)storage.data() + offset, length};
} else {
const char16_t *s =
reinterpret_cast<const char16_t *>(storage.data() + offset);
Expand Down
4 changes: 2 additions & 2 deletions lib/VM/RuntimeModule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ SymbolID RuntimeModule::createSymbolFromStringIDMayAllocate(
return mapStringMayAllocate(str, stringID, hash);
} else {
// ASCII.
const char *s = strStorage.begin() + entry.getOffset();
const char *s = (const char *)strStorage.begin() + entry.getOffset();
ASCIIRef str{s, entry.getLength()};
uint32_t hash = mhash ? *mhash : hashString(str);
return mapStringMayAllocate(str, stringID, hash);
Expand Down Expand Up @@ -327,7 +327,7 @@ std::string RuntimeModule::getStringFromStringID(StringID stringID) {
return out;
} else {
// ASCII.
const char *s = strStorage.begin() + entry.getOffset();
const char *s = (const char *)strStorage.begin() + entry.getOffset();
return std::string{s, entry.getLength()};
}
}
Expand Down
8 changes: 4 additions & 4 deletions unittests/BCGen/SupportTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ TEST(StringStorageTest, PackingStringStorageTest) {
std::vector<llvm::StringRef> strings{"phab", "alphabet", "soup", "ou"};
hbc::ConsecutiveStringStorage storage(strings);
const auto data = storage.acquireStringStorage();
llvm::StringRef dataAsStr(data.data(), data.size());
llvm::StringRef dataAsStr((const char *)data.data(), data.size());
EXPECT_EQ(dataAsStr.str(), "phabalphabetsoupou");
}

Expand All @@ -111,7 +111,7 @@ static void test1OptimizingStringStorage(
hbc::ConsecutiveStringStorage storage(strings, true /* optimize */);
auto index = storage.acquireStringTable();
auto data = storage.acquireStringStorage();
llvm::StringRef dataAsString(data.data(), data.size());
llvm::StringRef dataAsString((const char *)data.data(), data.size());
size_t idx = 0;
for (const auto &p : index) {
uint32_t offset = p.getOffset();
Expand Down Expand Up @@ -612,7 +612,7 @@ TEST(StringStorageTest, DeltaOptimizingModeTest) {
"ecoc", "octpeseta", "nationachime", "ationremass"};

auto baseTable = tableForStrings(baseStrings);
std::vector<char> baseBuffer = baseTable.acquireStringStorage();
std::vector<unsigned char> baseBuffer = baseTable.acquireStringStorage();
std::vector<StringTableEntry> baseEntries = baseTable.acquireStringTable();

// Create a new table starting with the base storage.
Expand All @@ -626,7 +626,7 @@ TEST(StringStorageTest, DeltaOptimizingModeTest) {

hbc::UniquingStringLiteralAccumulator baseAccumulator{
hbc::ConsecutiveStringStorage{std::vector<StringTableEntry>{baseEntries},
std::vector<char>{baseBuffer}},
std::vector<unsigned char>{baseBuffer}},
std::vector<bool>(baseEntries.size(), false)};

auto newTable = tableForStrings(newStrings, std::move(baseAccumulator));
Expand Down

0 comments on commit f4127df

Please sign in to comment.