[PDB] Hash types up front when merging types instead of using StringMap
Summary:
First, StringMap uses llvm::HashString, which is only good for short
identifiers and really bad for large blobs of binary data like type
records. Moving to `DenseMap<StringRef, TypeIndex>` with some tricks for
memory allocation fixes that.

Unfortunately, that didn't buy very much performance. Profiling showed
that we spend a long time during DenseMap growth rehashing existing
entries. Also, in general, DenseMap is faster when the keys are small.
This change takes that to the logical conclusion by introducing a small
wrapper value type around a pointer to key data. The key data contains a
precomputed hash, the original record data (pointer and size), and the
type index, which is the "value" of our original map.

This reduces the time to produce llvm-as.exe and llvm-as.pdb from ~15s
on my machine to 3.5s, which is about a 4x improvement.
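
As a rough illustration of the pattern described above, here is a minimal standalone sketch (hypothetical names; std::unordered_set stands in for LLVM's DenseSet, and it skips the copy of record bytes into a BumpPtrAllocator that the real code performs): the table key is a small pointer wrapper, the record's hash is computed once and cached next to the key data, so table growth only re-reads the cached hash instead of rehashing the full record bytes.

```cpp
// Minimal sketch of hash-up-front deduplication (C++17). Names are
// illustrative, not the LLVM API.
#include <cstdint>
#include <cstring>
#include <functional>
#include <memory>
#include <string_view>
#include <unordered_set>
#include <vector>

// Key data lives out of line; the set itself only stores small pointer wrappers.
struct HashedRecord {
  size_t Hash;         // precomputed once, reused on every rehash
  const uint8_t *Data; // record bytes
  unsigned Size;
  unsigned Index;      // the "value" of the original map
};

struct HashedRecordPtr {
  const HashedRecord *Ptr = nullptr;
};

struct PtrHash {
  size_t operator()(HashedRecordPtr P) const { return P.Ptr->Hash; }
};

struct PtrEqual {
  bool operator()(HashedRecordPtr A, HashedRecordPtr B) const {
    // Cheap checks first; only memcmp the full record on a hash/size match.
    if (A.Ptr->Hash != B.Ptr->Hash || A.Ptr->Size != B.Ptr->Size)
      return false;
    return std::memcmp(A.Ptr->Data, B.Ptr->Data, A.Ptr->Size) == 0;
  }
};

class RecordHasher {
  std::unordered_set<HashedRecordPtr, PtrHash, PtrEqual> Set;
  std::vector<std::unique_ptr<HashedRecord>> KeyStorage; // stand-in for a bump allocator
  unsigned NextIndex = 0;

public:
  // Deduplicating insert: returns the index of the first record with these bytes.
  // Assumes the caller's bytes outlive the hasher (the real patch copies them
  // into stable storage instead).
  unsigned getOrCreate(const uint8_t *Data, unsigned Size) {
    auto Key = std::make_unique<HashedRecord>();
    Key->Hash = std::hash<std::string_view>()(
        std::string_view(reinterpret_cast<const char *>(Data), Size));
    Key->Data = Data;
    Key->Size = Size;
    Key->Index = NextIndex;
    auto [It, Inserted] = Set.insert(HashedRecordPtr{Key.get()});
    if (Inserted) {
      KeyStorage.push_back(std::move(Key)); // keep the key alive
      ++NextIndex;
    }
    return It->Ptr->Index;
  }
};
```

The actual change below implements the same idea with DenseSet<HashedTypePtr>, a DenseMapInfo specialization that returns the cached hash, and BumpPtrAllocator-backed storage for both the keys and the record bytes.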

Reviewers: zturner, inglorion, ruiu

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D33428

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@303665 91177308-0d34-0410-b5e6-96231b3b80d8
rnk committed May 23, 2017
1 parent 6ae2b02 commit 9d05387
Showing 6 changed files with 139 additions and 94 deletions.
23 changes: 9 additions & 14 deletions include/llvm/DebugInfo/CodeView/TypeSerializer.h
@@ -17,7 +17,6 @@

#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Error.h"
@@ -26,6 +25,8 @@ namespace llvm {

namespace codeview {

class TypeHasher;

class TypeSerializer : public TypeVisitorCallbacks {
struct SubRecord {
SubRecord(TypeLeafKind K, uint32_t S) : Kind(K), Size(S) {}
@@ -45,43 +46,37 @@ class TypeSerializer : public TypeVisitorCallbacks {
}
};

typedef SmallVector<MutableArrayRef<uint8_t>, 2> RecordList;
typedef SmallVector<MutableArrayRef<uint8_t>, 2> MutableRecordList;

static constexpr uint8_t ContinuationLength = 8;
BumpPtrAllocator &RecordStorage;
RecordSegment CurrentSegment;
RecordList FieldListSegments;
MutableRecordList FieldListSegments;

TypeIndex LastTypeIndex;
Optional<TypeLeafKind> TypeKind;
Optional<TypeLeafKind> MemberKind;
std::vector<uint8_t> RecordBuffer;
MutableBinaryByteStream Stream;
BinaryStreamWriter Writer;
TypeRecordMapping Mapping;

RecordList SeenRecords;
StringMap<TypeIndex> HashedRecords;
/// Private type record hashing implementation details are handled here.
std::unique_ptr<TypeHasher> Hasher;

bool isInFieldList() const;
TypeIndex calcNextTypeIndex() const;
TypeIndex incrementTypeIndex();
MutableArrayRef<uint8_t> getCurrentSubRecordData();
MutableArrayRef<uint8_t> getCurrentRecordData();
Error writeRecordPrefix(TypeLeafKind Kind);
TypeIndex insertRecordBytesPrivate(MutableArrayRef<uint8_t> Record);
TypeIndex insertRecordBytesWithCopy(CVType &Record,
MutableArrayRef<uint8_t> Data);

Expected<MutableArrayRef<uint8_t>>
addPadding(MutableArrayRef<uint8_t> Record);

public:
explicit TypeSerializer(BumpPtrAllocator &Storage);
~TypeSerializer();

ArrayRef<MutableArrayRef<uint8_t>> records() const;
TypeIndex getLastTypeIndex() const;
TypeIndex insertRecordBytes(MutableArrayRef<uint8_t> Record);
ArrayRef<ArrayRef<uint8_t>> records() const;
TypeIndex insertRecordBytes(ArrayRef<uint8_t> Record);
Expected<TypeIndex> visitTypeEndGetIndex(CVType &Record);

Error visitTypeBegin(CVType &Record) override;
6 changes: 2 additions & 4 deletions include/llvm/DebugInfo/CodeView/TypeTableBuilder.h
@@ -64,7 +64,7 @@ class TypeTableBuilder {
return *ExpectedIndex;
}

TypeIndex writeSerializedRecord(MutableArrayRef<uint8_t> Record) {
TypeIndex writeSerializedRecord(ArrayRef<uint8_t> Record) {
return Serializer.insertRecordBytes(Record);
}

@@ -77,9 +77,7 @@
}
}

ArrayRef<MutableArrayRef<uint8_t>> records() const {
return Serializer.records();
}
ArrayRef<ArrayRef<uint8_t>> records() const { return Serializer.records(); }
};

class FieldListRecordBuilder {
4 changes: 2 additions & 2 deletions include/llvm/DebugInfo/CodeView/TypeTableCollection.h
@@ -18,7 +18,7 @@ namespace codeview {

class TypeTableCollection : public TypeCollection {
public:
explicit TypeTableCollection(ArrayRef<MutableArrayRef<uint8_t>> Records);
explicit TypeTableCollection(ArrayRef<ArrayRef<uint8_t>> Records);

Optional<TypeIndex> getFirst() override;
Optional<TypeIndex> getNext(TypeIndex Prev) override;
@@ -33,7 +33,7 @@ class TypeTableCollection : public TypeCollection {
bool hasCapacityFor(TypeIndex Index) const;
void ensureTypeExists(TypeIndex Index);

ArrayRef<MutableArrayRef<uint8_t>> Records;
ArrayRef<ArrayRef<uint8_t>> Records;
TypeDatabase Database;
};
}
183 changes: 119 additions & 64 deletions lib/DebugInfo/CodeView/TypeSerializer.cpp
@@ -9,28 +9,124 @@

#include "llvm/DebugInfo/CodeView/TypeSerializer.h"

#include "llvm/ADT/DenseSet.h"
#include "llvm/Support/BinaryStreamWriter.h"

#include <string.h>

using namespace llvm;
using namespace llvm::codeview;

bool TypeSerializer::isInFieldList() const {
return TypeKind.hasValue() && *TypeKind == TypeLeafKind::LF_FIELDLIST;
}
namespace {
struct HashedType {
uint64_t Hash;
const uint8_t *Data;
unsigned Size; // FIXME: Go to uint16_t?
TypeIndex Index;
};

/// Wrapper around a pointer to a HashedType. Hash and equality operations are
/// based on data in the pointee.
struct HashedTypePtr {
HashedTypePtr() = default;
HashedTypePtr(HashedType *Ptr) : Ptr(Ptr) {}
HashedType *Ptr = nullptr;
};
} // namespace

template <> struct DenseMapInfo<HashedTypePtr> {
static inline HashedTypePtr getEmptyKey() { return HashedTypePtr(nullptr); }
static inline HashedTypePtr getTombstoneKey() {
return HashedTypePtr(reinterpret_cast<HashedType *>(1));
}
static unsigned getHashValue(HashedTypePtr Val) {
assert(Val.Ptr != getEmptyKey().Ptr && Val.Ptr != getTombstoneKey().Ptr);
return Val.Ptr->Hash;
}
static bool isEqual(HashedTypePtr LHSP, HashedTypePtr RHSP) {
HashedType *LHS = LHSP.Ptr;
HashedType *RHS = RHSP.Ptr;
if (RHS == getEmptyKey().Ptr || RHS == getTombstoneKey().Ptr)
return LHS == RHS;
if (LHS->Hash != RHS->Hash || LHS->Size != RHS->Size)
return false;
return ::memcmp(LHS->Data, RHS->Data, LHS->Size) == 0;
}
};

/// Private implementation so that we don't leak our DenseMap instantiations to
/// users.
class llvm::codeview::TypeHasher {
private:
/// Storage for type records provided by the caller. Records will outlive the
/// hasher object, so they should be allocated here.
BumpPtrAllocator &RecordStorage;

/// Storage for hash keys. These only need to live as long as the hashing
/// operation.
BumpPtrAllocator KeyStorage;

/// Hash table. We really want a DenseMap<ArrayRef<uint8_t>, TypeIndex> here,
/// but DenseMap is inefficient when the keys are long (like type records)
/// because it recomputes the hash value of every key when it grows. This
/// value type stores the hash out of line in KeyStorage, so that table
/// entries are small and easy to rehash.
DenseSet<HashedTypePtr> HashedRecords;

SmallVector<ArrayRef<uint8_t>, 2> SeenRecords;

TypeIndex NextTypeIndex = TypeIndex(TypeIndex::FirstNonSimpleIndex);

public:
TypeHasher(BumpPtrAllocator &RecordStorage) : RecordStorage(RecordStorage) {}

ArrayRef<ArrayRef<uint8_t>> records() const { return SeenRecords; }

/// Takes the bytes of a type record, inserts them into the hash table, saves
/// them, and returns a pointer to an identical stable type record along with
/// its type index in the destination stream.
TypeIndex getOrCreateRecord(ArrayRef<uint8_t> &Record);
};

TypeIndex TypeHasher::getOrCreateRecord(ArrayRef<uint8_t> &Record) {
assert(Record.size() < UINT32_MAX && "Record too big");
assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!");

// Compute the hash up front so we can store it in the key.
HashedType TempHashedType = {hash_value(Record), Record.data(),
unsigned(Record.size()), NextTypeIndex};

auto Result = HashedRecords.insert(HashedTypePtr(&TempHashedType));
HashedType *&Hashed = Result.first->Ptr;

if (Result.second) {
// This was a new type record. We need stable storage for both the key and
// the record. The record should outlive the hashing operation.
Hashed = KeyStorage.Allocate<HashedType>();
*Hashed = TempHashedType;

uint8_t *Stable = RecordStorage.Allocate<uint8_t>(Record.size());
memcpy(Stable, Record.data(), Record.size());
Hashed->Data = Stable;
assert(Hashed->Size == Record.size());

// This was a new record, so increment our next type index.
++NextTypeIndex;
}

// Update the caller's copy of Record to point at a stable copy.
Record = ArrayRef<uint8_t>(Hashed->Data, Hashed->Size);

if (Result.second) {
// FIXME: Can we record these in a more efficient way?
SeenRecords.push_back(Record);
}

TypeIndex TypeSerializer::calcNextTypeIndex() const {
if (LastTypeIndex.isNoneType())
return TypeIndex(TypeIndex::FirstNonSimpleIndex);
else
return TypeIndex(LastTypeIndex.getIndex() + 1);
return TypeIndex(Hashed->Index);
}

TypeIndex TypeSerializer::incrementTypeIndex() {
TypeIndex Previous = LastTypeIndex;
LastTypeIndex = calcNextTypeIndex();
return Previous;
bool TypeSerializer::isInFieldList() const {
return TypeKind.hasValue() && *TypeKind == TypeLeafKind::LF_FIELDLIST;
}

MutableArrayRef<uint8_t> TypeSerializer::getCurrentSubRecordData() {
@@ -51,46 +147,6 @@ Error TypeSerializer::writeRecordPrefix(TypeLeafKind Kind) {
return Error::success();
}

TypeIndex
TypeSerializer::insertRecordBytesPrivate(MutableArrayRef<uint8_t> Record) {
assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!");

StringRef S(reinterpret_cast<const char *>(Record.data()), Record.size());

TypeIndex NextTypeIndex = calcNextTypeIndex();
auto Result = HashedRecords.try_emplace(S, NextTypeIndex);
if (Result.second) {
LastTypeIndex = NextTypeIndex;
SeenRecords.push_back(Record);
}
return Result.first->getValue();
}

TypeIndex
TypeSerializer::insertRecordBytesWithCopy(CVType &Record,
MutableArrayRef<uint8_t> Data) {
assert(Data.size() % 4 == 0 && "Record is not aligned to 4 bytes!");

StringRef S(reinterpret_cast<const char *>(Data.data()), Data.size());

// Do a two state lookup / insert so that we don't have to allocate unless
// we're going
// to do an insert. This is a big memory savings.
auto Iter = HashedRecords.find(S);
if (Iter != HashedRecords.end())
return Iter->second;

LastTypeIndex = calcNextTypeIndex();
uint8_t *Copy = RecordStorage.Allocate<uint8_t>(Data.size());
::memcpy(Copy, Data.data(), Data.size());
Data = MutableArrayRef<uint8_t>(Copy, Data.size());
S = StringRef(reinterpret_cast<const char *>(Data.data()), Data.size());
HashedRecords.insert(std::make_pair(S, LastTypeIndex));
SeenRecords.push_back(Data);
Record.RecordData = Data;
return LastTypeIndex;
}

Expected<MutableArrayRef<uint8_t>>
TypeSerializer::addPadding(MutableArrayRef<uint8_t> Record) {
uint32_t Align = Record.size() % 4;
@@ -109,26 +165,25 @@ TypeSerializer::addPadding(MutableArrayRef<uint8_t> Record) {
}

TypeSerializer::TypeSerializer(BumpPtrAllocator &Storage)
: RecordStorage(Storage), LastTypeIndex(),
RecordBuffer(MaxRecordLength * 2),
: RecordStorage(Storage), RecordBuffer(MaxRecordLength * 2),
Stream(RecordBuffer, llvm::support::little), Writer(Stream),
Mapping(Writer) {
Mapping(Writer), Hasher(make_unique<TypeHasher>(Storage)) {
// RecordBuffer needs to be able to hold enough data so that if we are 1
// byte short of MaxRecordLen, and then we try to write MaxRecordLen bytes,
// we won't overflow.
}

ArrayRef<MutableArrayRef<uint8_t>> TypeSerializer::records() const {
return SeenRecords;
}
TypeSerializer::~TypeSerializer() = default;

TypeIndex TypeSerializer::getLastTypeIndex() const { return LastTypeIndex; }
ArrayRef<ArrayRef<uint8_t>> TypeSerializer::records() const {
return Hasher->records();
}

TypeIndex TypeSerializer::insertRecordBytes(MutableArrayRef<uint8_t> Record) {
TypeIndex TypeSerializer::insertRecordBytes(ArrayRef<uint8_t> Record) {
assert(!TypeKind.hasValue() && "Already in a type mapping!");
assert(Writer.getOffset() == 0 && "Stream has data already!");

return insertRecordBytesPrivate(Record);
return Hasher->getOrCreateRecord(Record);
}

Error TypeSerializer::visitTypeBegin(CVType &Record) {
@@ -163,8 +218,8 @@ Expected<TypeIndex> TypeSerializer::visitTypeEndGetIndex(CVType &Record) {
Prefix->RecordLen = ThisRecordData.size() - sizeof(uint16_t);

Record.Type = *TypeKind;
TypeIndex InsertedTypeIndex =
insertRecordBytesWithCopy(Record, ThisRecordData);
Record.RecordData = ThisRecordData;
TypeIndex InsertedTypeIndex = Hasher->getOrCreateRecord(Record.RecordData);

// Write out each additional segment in reverse order, and update each
// record's continuation index to point to the previous one.
@@ -174,7 +229,7 @@ Expected<TypeIndex> TypeSerializer::visitTypeEndGetIndex(CVType &Record) {
reinterpret_cast<support::ulittle32_t *>(CIBytes.data());
assert(*CI == 0xB0C0B0C0 && "Invalid TypeIndex placeholder");
*CI = InsertedTypeIndex.getIndex();
InsertedTypeIndex = insertRecordBytesPrivate(X);
InsertedTypeIndex = Hasher->getOrCreateRecord(X);
}

TypeKind.reset();
3 changes: 1 addition & 2 deletions lib/DebugInfo/CodeView/TypeTableCollection.cpp
@@ -24,8 +24,7 @@ static void error(Error &&EC) {
consumeError(std::move(EC));
}

TypeTableCollection::TypeTableCollection(
ArrayRef<MutableArrayRef<uint8_t>> Records)
TypeTableCollection::TypeTableCollection(ArrayRef<ArrayRef<uint8_t>> Records)
: Records(Records), Database(Records.size()) {}

Optional<TypeIndex> TypeTableCollection::getFirst() {
14 changes: 6 additions & 8 deletions tools/llvm-pdbdump/llvm-pdbdump.cpp
@@ -877,14 +877,12 @@ static void mergePdbs() {

auto &DestTpi = Builder.getTpiBuilder();
auto &DestIpi = Builder.getIpiBuilder();
MergedTpi.ForEachRecord(
[&DestTpi](TypeIndex TI, MutableArrayRef<uint8_t> Data) {
DestTpi.addTypeRecord(Data, None);
});
MergedIpi.ForEachRecord(
[&DestIpi](TypeIndex TI, MutableArrayRef<uint8_t> Data) {
DestIpi.addTypeRecord(Data, None);
});
MergedTpi.ForEachRecord([&DestTpi](TypeIndex TI, ArrayRef<uint8_t> Data) {
DestTpi.addTypeRecord(Data, None);
});
MergedIpi.ForEachRecord([&DestIpi](TypeIndex TI, ArrayRef<uint8_t> Data) {
DestIpi.addTypeRecord(Data, None);
});

SmallString<64> OutFile(opts::merge::PdbOutputFile);
if (OutFile.empty()) {
