Skip to content

Commit

Permalink
Reduce size of entries written in phases 1 and 2
Browse files Browse the repository at this point in the history
The earlier 'maximum entry size' was a remnant from the old sorting
and backpropagation algorithms which required extra space as the entries
were re-written in place.
  • Loading branch information
rostislav authored and hoffmang9 committed Apr 2, 2021
1 parent 50856aa commit f346f2e
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 6 deletions.
7 changes: 7 additions & 0 deletions src/entry_sizes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,13 @@ class EntrySizes {
}
}

// Byte size of an entry laid out as (sort_key, pos, offset). Entries of this
// shape are written to table 7 in phase 1 and to tables 2-7 in phase 2.
//
// k: the plot's k parameter. The bit count is 2 * k + kOffsetSize, which
//    presumably means sort_key and pos take k bits each and offset takes
//    kOffsetSize bits -- confirm against the entry writers.
//
// Returns cdiv(bit count, 8), i.e. the bit count converted to bytes
// (cdiv is assumed to round up -- see its definition).
static uint32_t GetKeyPosOffsetSize(uint8_t k)
{
    // `auto` keeps whatever type the promoted arithmetic yields, so the
    // argument handed to cdiv has the same type as the inline expression.
    auto const entry_bits = 2 * k + kOffsetSize;
    return cdiv(entry_bits, 8);
}

// Calculates the size of one C3 park. This will store bits for each f7 between
// two C1 checkpoints, depending on how many times that f7 is present. For low
// values of k, we need extra space to account for the additional variability.
Expand Down
13 changes: 12 additions & 1 deletion src/phase1.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,7 @@ std::vector<uint64_t> RunPhase1(
uint32_t const log_num_buckets,
uint32_t const stripe_size,
uint8_t const num_threads,
bool const enable_bitfield,
bool const show_progress)
{
std::cout << "Computing table 1" << std::endl;
Expand Down Expand Up @@ -652,9 +653,19 @@ std::vector<uint64_t> RunPhase1(

// Determines how many bytes the entries in our left and right tables will take up.
uint32_t const entry_size_bytes = EntrySizes::GetMaxEntrySize(k, table_index, true);
uint32_t const compressed_entry_size_bytes = EntrySizes::GetMaxEntrySize(k, table_index, false);
uint32_t compressed_entry_size_bytes = EntrySizes::GetMaxEntrySize(k, table_index, false);
right_entry_size_bytes = EntrySizes::GetMaxEntrySize(k, table_index + 1, true);

if (enable_bitfield && table_index != 1) {
// We only write pos and offset to tables 2-6 after removing
// metadata
compressed_entry_size_bytes = cdiv(k + kOffsetSize, 8);
if (table_index == 6) {
// Table 7 will contain f7, pos and offset
right_entry_size_bytes = EntrySizes::GetKeyPosOffsetSize(k);
}
}

std::cout << "Computing table " << int{table_index + 1} << std::endl;
// Start of parallel execution

Expand Down
7 changes: 4 additions & 3 deletions src/phase2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ Phase2Results RunPhase2(
uint8_t const pos_offset_shift = write_counter_shift - pos_offset_size;
uint8_t const f7_shift = 128 - k;
uint8_t const t7_pos_offset_shift = f7_shift - pos_offset_size;
uint8_t const new_entry_size = EntrySizes::GetKeyPosOffsetSize(k);

std::vector<uint64_t> new_table_sizes(8, 0);
new_table_sizes[7] = table_sizes[7];
Expand Down Expand Up @@ -102,7 +103,7 @@ Phase2Results RunPhase2(
next_bitfield.clear();

int64_t const table_size = table_sizes[table_index];
int16_t const entry_size = EntrySizes::GetMaxEntrySize(k, table_index, false);
int16_t const entry_size = cdiv(k + kOffsetSize + (table_index == 7 ? k : 0), 8);

BufferedDisk disk(&tmp_1_disks[table_index], table_size * entry_size);

Expand Down Expand Up @@ -157,7 +158,7 @@ Phase2Results RunPhase2(
table_index == 2 ? memory_size : memory_size / 2,
num_buckets,
log_num_buckets,
uint16_t(entry_size),
new_entry_size,
tmp_dirname,
filename + ".p2.t" + std::to_string(table_index),
uint32_t(k),
Expand Down Expand Up @@ -265,7 +266,7 @@ Phase2Results RunPhase2(

return {
FilteredDisk(std::move(disk), std::move(current_bitfield), entry_size)
, BufferedDisk(&tmp_1_disks[7], new_table_sizes[7] * EntrySizes::GetMaxEntrySize(k, 7, false))
, BufferedDisk(&tmp_1_disks[7], new_table_sizes[7] * new_entry_size)
, std::move(output_files)
, std::move(new_table_sizes)
};
Expand Down
5 changes: 3 additions & 2 deletions src/phase3.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ Phase3Results RunPhase3(
uint32_t right_sort_key_size = k;

uint32_t left_entry_size_bytes = EntrySizes::GetMaxEntrySize(k, table_index, false);
uint32_t p2_entry_size_bytes = EntrySizes::GetKeyPosOffsetSize(k);
right_entry_size_bytes = EntrySizes::GetMaxEntrySize(k, table_index + 1, false);

uint64_t left_reader = 0;
Expand Down Expand Up @@ -242,8 +243,8 @@ Phase3Results RunPhase3(
}
// The right entries are in the format from backprop, (sort_key, pos,
// offset)
uint8_t const* right_entry_buf = right_disk.Read(right_reader, right_entry_size_bytes);
right_reader += right_entry_size_bytes;
uint8_t const* right_entry_buf = right_disk.Read(right_reader, p2_entry_size_bytes);
right_reader += p2_entry_size_bytes;
right_reader_count++;

entry_sort_key =
Expand Down
1 change: 1 addition & 0 deletions src/plotter_disk.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ class DiskPlotter {
log_num_buckets,
stripe_size,
num_threads,
!nobitfield,
show_progress);
p1.PrintElapsed("Time for phase 1 =");

Expand Down

0 comments on commit f346f2e

Please sign in to comment.