[EM] Avoid resizing host cache. (dmlc#10734)
* [EM] Avoid resizing host cache.

- Add SAM allocator and resource.
- Use page-based cache instead of stream-based cache.
trivialfis authored Aug 22, 2024
1 parent dbfafd8 commit 55aef8f
Showing 16 changed files with 262 additions and 144 deletions.
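The two bullets in the commit message summarize the change: the host cache now keeps one fixed-size allocation per Ellpack page instead of serializing every page into a single, repeatedly resized stream buffer. The sketch below is illustrative only; HostCache and Page are hypothetical names, not types from this commit.

#include <cstddef>
#include <memory>
#include <vector>

struct Page {
  std::vector<std::byte> data;  // stand-in for a page's gidx_buffer
};

class HostCache {
  // One allocation per cached page: appending a new page never resizes or
  // moves the memory that earlier pages already occupy.
  std::vector<std::shared_ptr<Page>> pages_;

 public:
  void Push(std::shared_ptr<Page> page) { pages_.push_back(std::move(page)); }
  [[nodiscard]] std::shared_ptr<Page> const& At(std::size_t i) const { return pages_.at(i); }
};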
2 changes: 1 addition & 1 deletion jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu
@@ -132,7 +132,7 @@ class DataIteratorProxy {
bool cache_on_host_{true}; // TODO(Bobby): Make this optional.

template <typename T>
using Alloc = xgboost::common::cuda_impl::pinned_allocator<T>;
using Alloc = xgboost::common::cuda_impl::PinnedAllocator<T>;
template <typename U>
using HostVector = std::vector<U, Alloc<U>>;

61 changes: 48 additions & 13 deletions src/common/cuda_pinned_allocator.h
@@ -21,7 +21,6 @@ namespace xgboost::common::cuda_impl {
// that Thrust used to provide.
//
// \see https://en.cppreference.com/w/cpp/memory/allocator

template <typename T>
struct PinnedAllocPolicy {
using pointer = T*; // NOLINT: The type returned by address() / allocate()
@@ -33,7 +32,7 @@ struct PinnedAllocPolicy {
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

pointer allocate(size_type cnt, const_pointer = nullptr) { // NOLINT
[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
@@ -57,7 +56,7 @@ struct ManagedAllocPolicy {
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

pointer allocate(size_type cnt, const_pointer = nullptr) { // NOLINT
[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
@@ -70,16 +69,49 @@ struct ManagedAllocPolicy {
void deallocate(pointer p, size_type) { dh::safe_cuda(cudaFree(p)); } // NOLINT
};

// This is actually a pinned memory allocator in disguise. We utilize HMM or ATS for
// efficient tracked memory allocation.
template <typename T>
struct SamAllocPolicy {
using pointer = T*; // NOLINT: The type returned by address() / allocate()
using const_pointer = const T*; // NOLINT: The type returned by address()
using size_type = std::size_t; // NOLINT: The type used for the size of the allocation
using value_type = T; // NOLINT: The type of the elements in the allocator

size_type max_size() const { // NOLINT
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if

size_type n_bytes = cnt * sizeof(value_type);
pointer result = reinterpret_cast<pointer>(std::malloc(n_bytes));
if (!result) {
throw std::bad_alloc{};
}
dh::safe_cuda(cudaHostRegister(result, n_bytes, cudaHostRegisterDefault));
return result;
}

void deallocate(pointer p, size_type) { // NOLINT
dh::safe_cuda(cudaHostUnregister(p));
std::free(p);
}
};

template <typename T, template <typename> typename Policy>
class CudaHostAllocatorImpl : public Policy<T> { // NOLINT
class CudaHostAllocatorImpl : public Policy<T> {
public:
using value_type = typename Policy<T>::value_type; // NOLINT
using pointer = typename Policy<T>::pointer; // NOLINT
using const_pointer = typename Policy<T>::const_pointer; // NOLINT
using size_type = typename Policy<T>::size_type; // NOLINT
using typename Policy<T>::value_type;
using typename Policy<T>::pointer;
using typename Policy<T>::const_pointer;
using typename Policy<T>::size_type;

using reference = T&; // NOLINT: The parameter type for address()
using const_reference = const T&; // NOLINT: The parameter type for address()
using reference = value_type&; // NOLINT: The parameter type for address()
using const_reference = const value_type&; // NOLINT: The parameter type for address()

using difference_type = std::ptrdiff_t; // NOLINT: The type of the distance between two pointers

@@ -101,14 +133,17 @@ class CudaHostAllocatorImpl : public Policy<T> { // NOLINT
pointer address(reference r) { return &r; } // NOLINT
const_pointer address(const_reference r) { return &r; } // NOLINT

bool operator==(CudaHostAllocatorImpl const& x) const { return true; }
bool operator==(CudaHostAllocatorImpl const&) const { return true; }

bool operator!=(CudaHostAllocatorImpl const& x) const { return !operator==(x); }
};

template <typename T>
using pinned_allocator = CudaHostAllocatorImpl<T, PinnedAllocPolicy>; // NOLINT
using PinnedAllocator = CudaHostAllocatorImpl<T, PinnedAllocPolicy>; // NOLINT

template <typename T>
using managed_allocator = CudaHostAllocatorImpl<T, ManagedAllocPolicy>; // NOLINT
using ManagedAllocator = CudaHostAllocatorImpl<T, ManagedAllocPolicy>; // NOLINT

template <typename T>
using SamAllocator = CudaHostAllocatorImpl<T, SamAllocPolicy>;
} // namespace xgboost::common::cuda_impl
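A minimal usage sketch for the aliases above, assuming a CUDA-capable build and the include path used within src/common; the function name is illustrative. SamAllocator backs the container with ordinary malloc'd memory registered via cudaHostRegister, so the buffer is host-resident but directly accessible from device code.

#include <cstddef>
#include <vector>

#include "cuda_pinned_allocator.h"  // assumed include path within src/common

using xgboost::common::cuda_impl::SamAllocator;

// Returns a host buffer whose storage is registered with the CUDA driver.
std::vector<std::byte, SamAllocator<std::byte>> MakeRegisteredHostBuffer(std::size_t n_bytes) {
  return std::vector<std::byte, SamAllocator<std::byte>>(n_bytes);
}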
3 changes: 3 additions & 0 deletions src/common/io.h
@@ -286,6 +286,7 @@ class ResourceHandler {
kMmap = 1,
kCudaMalloc = 2,
kCudaMmap = 3,
kCudaHostCache = 4,
};

private:
@@ -310,6 +311,8 @@ class ResourceHandler {
return "CudaMalloc";
case kCudaMmap:
return "CudaMmap";
case kCudaHostCache:
return "CudaHostCache";
}
LOG(FATAL) << "Unreachable.";
return {};
12 changes: 9 additions & 3 deletions src/common/ref_resource_view.cuh
@@ -16,8 +16,7 @@ namespace xgboost::common {
* @brief Make a fixed size `RefResourceView` with cudaMalloc resource.
*/
template <typename T>
[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const*,
std::size_t n_elements) {
[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(std::size_t n_elements) {
auto resource = std::make_shared<common::CudaMallocResource>(n_elements * sizeof(T));
auto ref = RefResourceView{resource->DataAs<T>(), n_elements, resource};
return ref;
@@ -26,8 +25,15 @@ template <typename T>
template <typename T>
[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const* ctx,
std::size_t n_elements, T const& init) {
auto ref = MakeFixedVecWithCudaMalloc<T>(ctx, n_elements);
auto ref = MakeFixedVecWithCudaMalloc<T>(n_elements);
thrust::fill_n(ctx->CUDACtx()->CTP(), ref.data(), ref.size(), init);
return ref;
}

template <typename T>
[[nodiscard]] RefResourceView<T> MakeFixedVecWithPinnedMalloc(std::size_t n_elements) {
auto resource = std::make_shared<common::CudaPinnedResource>(n_elements * sizeof(T));
auto ref = RefResourceView{resource->DataAs<T>(), n_elements, resource};
return ref;
}
} // namespace xgboost::common
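A hedged usage sketch for the new helper, assuming the header is included from within src/common; the element type and size are illustrative.

#include <cstdint>

#include "ref_resource_view.cuh"  // assumed include path

void PinnedViewExample() {
  // The view is backed by a CudaPinnedResource that stays alive for as long as
  // the view holds its shared_ptr to the resource.
  auto host_buf = xgboost::common::MakeFixedVecWithPinnedMalloc<std::uint8_t>(1024);
  std::uint8_t* ptr = host_buf.data();  // pinned (cudaHostRegister-ed) host memory
  (void)ptr;                            // host_buf.size() == 1024
}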
23 changes: 20 additions & 3 deletions src/common/resource.cuh
@@ -5,9 +5,10 @@
#include <cstddef> // for size_t
#include <functional> // for function

#include "device_vector.cuh" // for DeviceUVector
#include "io.h" // for ResourceHandler, MMAPFile
#include "xgboost/string_view.h" // for StringView
#include "cuda_pinned_allocator.h" // for SamAllocator
#include "device_vector.cuh" // for DeviceUVector
#include "io.h" // for ResourceHandler, MMAPFile
#include "xgboost/string_view.h" // for StringView

namespace xgboost::common {
/**
@@ -29,6 +30,22 @@ class CudaMallocResource : public ResourceHandler {
void Resize(std::size_t n_bytes) { this->storage_.resize(n_bytes); }
};

class CudaPinnedResource : public ResourceHandler {
std::vector<std::byte, cuda_impl::SamAllocator<std::byte>> storage_;

void Clear() noexcept(true) { this->Resize(0); }

public:
explicit CudaPinnedResource(std::size_t n_bytes) : ResourceHandler{kCudaHostCache} {
this->Resize(n_bytes);
}
~CudaPinnedResource() noexcept(true) override { this->Clear(); }

[[nodiscard]] void* Data() override { return storage_.data(); }
[[nodiscard]] std::size_t Size() const override { return storage_.size(); }
void Resize(std::size_t n_bytes) { this->storage_.resize(n_bytes); }
};
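A short sketch of constructing the new resource directly, assuming the header above; in the commit it is normally created through MakeFixedVecWithPinnedMalloc rather than by hand.

#include <cstddef>
#include <memory>

#include "resource.cuh"  // assumed include path

std::shared_ptr<xgboost::common::ResourceHandler> MakeHostCacheResource(std::size_t n_bytes) {
  // Owns cudaHostRegister-ed system memory; the reported kind is kCudaHostCache.
  return std::make_shared<xgboost::common::CudaPinnedResource>(n_bytes);
}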

class CudaMmapResource : public ResourceHandler {
std::unique_ptr<MMAPFile, std::function<void(MMAPFile*)>> handle_;
std::size_t n_;
7 changes: 5 additions & 2 deletions src/data/ellpack_page.cu
@@ -404,7 +404,7 @@ size_t EllpackPageImpl::Copy(Context const* ctx, EllpackPageImpl const* page, bs
bst_idx_t num_elements = page->n_rows * page->row_stride;
CHECK_EQ(this->row_stride, page->row_stride);
CHECK_EQ(NumSymbols(), page->NumSymbols());
CHECK_GE(n_rows * row_stride, offset + num_elements);
CHECK_GE(this->n_rows * this->row_stride, offset + num_elements);
if (page == this) {
LOG(FATAL) << "Concatenating the same Ellpack.";
return this->n_rows * this->row_stride;
@@ -542,7 +542,10 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
// Return the number of rows contained in this page.
[[nodiscard]] bst_idx_t EllpackPageImpl::Size() const { return n_rows; }

std::size_t EllpackPageImpl::MemCostBytes() const { return this->gidx_buffer.size_bytes(); }
std::size_t EllpackPageImpl::MemCostBytes() const {
return this->gidx_buffer.size_bytes() + sizeof(this->n_rows) + sizeof(this->is_dense) +
sizeof(this->row_stride) + sizeof(this->base_rowid);
}
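For illustration, and assuming 8-byte n_rows, row_stride, and base_rowid plus a 1-byte is_dense, a page whose gidx_buffer occupies 1 MiB now reports 1,048,576 + 8 + 1 + 8 + 8 = 1,048,601 bytes, whereas it previously reported only the 1,048,576-byte buffer.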

EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
DeviceOrd device, common::Span<FeatureType const> feature_types) const {
5 changes: 3 additions & 2 deletions src/data/ellpack_page.cuh
@@ -66,6 +66,7 @@ struct EllpackDeviceAccessor {
min_fvalue = cuts->min_vals_.ConstHostSpan();
}
}

/**
* @brief Given a row index and a feature index, returns the corresponding cut value.
*
@@ -75,7 +76,7 @@
* local to the current batch.
*/
template <bool global_ridx = true>
[[nodiscard]] __device__ bst_bin_t GetBinIndex(size_t ridx, size_t fidx) const {
[[nodiscard]] __device__ bst_bin_t GetBinIndex(bst_idx_t ridx, size_t fidx) const {
if (global_ridx) {
ridx -= base_rowid;
}
@@ -114,7 +115,7 @@ struct EllpackDeviceAccessor {
return idx;
}

[[nodiscard]] __device__ float GetFvalue(size_t ridx, size_t fidx) const {
[[nodiscard]] __device__ float GetFvalue(bst_idx_t ridx, size_t fidx) const {
auto gidx = GetBinIndex(ridx, fidx);
if (gidx == -1) {
return std::numeric_limits<float>::quiet_NaN();
51 changes: 7 additions & 44 deletions src/data/ellpack_page_raw_format.cu
@@ -39,8 +39,7 @@ template <typename T>
return false;
}

auto ctx = Context{}.MakeCUDA(common::CurrentDevice());
*vec = common::MakeFixedVecWithCudaMalloc<T>(&ctx, n);
*vec = common::MakeFixedVecWithCudaMalloc<T>(n);
dh::safe_cuda(cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream()));
return true;
}
@@ -96,57 +95,21 @@ template <typename T>
CHECK(this->cuts_->cut_values_.DeviceCanRead());
impl->SetCuts(this->cuts_);

// Read vector
Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
auto read_vec = [&] {
common::NvtxScopedRange range{common::NvtxEventAttr{"read-vec", common::NvtxRgb{127, 255, 0}}};
bst_idx_t n{0};
RET_IF_NOT(fi->Read(&n));
if (n == 0) {
return true;
}
impl->gidx_buffer = common::MakeFixedVecWithCudaMalloc<common::CompressedByteT>(&ctx, n);
RET_IF_NOT(fi->Read(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes()));
return true;
};
RET_IF_NOT(read_vec());

RET_IF_NOT(fi->Read(&impl->n_rows));
RET_IF_NOT(fi->Read(&impl->is_dense));
RET_IF_NOT(fi->Read(&impl->row_stride));
RET_IF_NOT(fi->Read(&impl->base_rowid));

fi->Read(page);
dh::DefaultStream().Sync();

return true;
}

[[nodiscard]] std::size_t EllpackPageRawFormat::Write(const EllpackPage& page,
EllpackHostCacheStream* fo) const {
xgboost_NVTX_FN_RANGE();

bst_idx_t bytes{0};
auto* impl = page.Impl();

// Write vector
auto write_vec = [&] {
common::NvtxScopedRange range{common::NvtxEventAttr{"write-vec", common::NvtxRgb{127, 255, 0}}};
bst_idx_t n = impl->gidx_buffer.size();
bytes += fo->Write(n);

if (!impl->gidx_buffer.empty()) {
bytes += fo->Write(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes());
}
};

write_vec();

bytes += fo->Write(impl->n_rows);
bytes += fo->Write(impl->is_dense);
bytes += fo->Write(impl->row_stride);
bytes += fo->Write(impl->base_rowid);

fo->Write(page);
dh::DefaultStream().Sync();
return bytes;

auto* impl = page.Impl();
return impl->MemCostBytes();
}

#undef RET_IF_NOT
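A hedged sketch of the page-based round trip this file now implements: the whole EllpackPage is handed to the host-cache stream instead of serializing gidx_buffer and the scalar fields one by one. Namespace qualifiers and construction of the format/stream objects are omitted, and Read()'s exact signature is an assumption inferred from the usage above.

std::size_t RoundTrip(EllpackPageRawFormat* fmt, EllpackHostCacheStream* cache,
                      EllpackPage const& in, EllpackPage* out) {
  std::size_t n_bytes = fmt->Write(in, cache);  // hands the whole page to the host cache
  CHECK(fmt->Read(out, cache));                 // reads the cached page back
  return n_bytes;                               // Write() now returns the page's MemCostBytes()
}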