[EM] Avoid resizing host cache. (dmlc#10734)
* [EM] Avoid resizing host cache.

- Add SAM allocator and resource.
- Use page-based cache instead of stream-based cache.
trivialfis authored Aug 22, 2024
1 parent dbfafd8 commit 55aef8f
Showing 16 changed files with 262 additions and 144 deletions.
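The two bullets in the commit message summarize the change: the host cache now keeps one fixed-size allocation per Ellpack page instead of serializing every page into a single, repeatedly resized stream buffer. The sketch below is illustrative only; HostCache and Page are hypothetical names, not types from this commit.

#include <cstddef>
#include <memory>
#include <vector>

struct Page {
  std::vector<std::byte> data;  // stand-in for a page's gidx_buffer
};

class HostCache {
  // One allocation per cached page: appending a new page never resizes or
  // moves the memory that earlier pages already occupy.
  std::vector<std::shared_ptr<Page>> pages_;

 public:
  void Push(std::shared_ptr<Page> page) { pages_.push_back(std::move(page)); }
  [[nodiscard]] std::shared_ptr<Page> const& At(std::size_t i) const { return pages_.at(i); }
};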
2 changes: 1 addition & 1 deletion jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu
@@ -132,7 +132,7 @@ class DataIteratorProxy {
bool cache_on_host_{true}; // TODO(Bobby): Make this optional.

template <typename T>
using Alloc = xgboost::common::cuda_impl::pinned_allocator<T>;
using Alloc = xgboost::common::cuda_impl::PinnedAllocator<T>;
template <typename U>
using HostVector = std::vector<U, Alloc<U>>;

61 changes: 48 additions & 13 deletions src/common/cuda_pinned_allocator.h
@@ -21,7 +21,6 @@ namespace xgboost::common::cuda_impl {
// that Thrust used to provide.
//
// \see https://en.cppreference.com/w/cpp/memory/allocator

template <typename T>
struct PinnedAllocPolicy {
using pointer = T*; // NOLINT: The type returned by address() / allocate()
@@ -33,7 +32,7 @@ struct PinnedAllocPolicy {
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

pointer allocate(size_type cnt, const_pointer = nullptr) { // NOLINT
[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
@@ -57,7 +56,7 @@ struct ManagedAllocPolicy {
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

pointer allocate(size_type cnt, const_pointer = nullptr) { // NOLINT
[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
@@ -70,16 +69,49 @@ struct ManagedAllocPolicy {
void deallocate(pointer p, size_type) { dh::safe_cuda(cudaFree(p)); } // NOLINT
};

// This is actually a pinned memory allocator in disguise. We utilize HMM or ATS for
// efficient tracked memory allocation.
template <typename T>
struct SamAllocPolicy {
using pointer = T*; // NOLINT: The type returned by address() / allocate()
using const_pointer = const T*; // NOLINT: The type returned by address()
using size_type = std::size_t; // NOLINT: The type used for the size of the allocation
using value_type = T; // NOLINT: The type of the elements in the allocator

size_type max_size() const { // NOLINT
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if

size_type n_bytes = cnt * sizeof(value_type);
pointer result = reinterpret_cast<pointer>(std::malloc(n_bytes));
if (!result) {
throw std::bad_alloc{};
}
dh::safe_cuda(cudaHostRegister(result, n_bytes, cudaHostRegisterDefault));
return result;
}

void deallocate(pointer p, size_type) { // NOLINT
dh::safe_cuda(cudaHostUnregister(p));
std::free(p);
}
};

template <typename T, template <typename> typename Policy>
class CudaHostAllocatorImpl : public Policy<T> { // NOLINT
class CudaHostAllocatorImpl : public Policy<T> {
public:
using value_type = typename Policy<T>::value_type; // NOLINT
using pointer = typename Policy<T>::pointer; // NOLINT
using const_pointer = typename Policy<T>::const_pointer; // NOLINT
using size_type = typename Policy<T>::size_type; // NOLINT
using typename Policy<T>::value_type;
using typename Policy<T>::pointer;
using typename Policy<T>::const_pointer;
using typename Policy<T>::size_type;

using reference = T&; // NOLINT: The parameter type for address()
using const_reference = const T&; // NOLINT: The parameter type for address()
using reference = value_type&; // NOLINT: The parameter type for address()
using const_reference = const value_type&; // NOLINT: The parameter type for address()

using difference_type = std::ptrdiff_t; // NOLINT: The type of the distance between two pointers

@@ -101,14 +133,17 @@ class CudaHostAllocatorImpl : public Policy<T> { // NOLINT
pointer address(reference r) { return &r; } // NOLINT
const_pointer address(const_reference r) { return &r; } // NOLINT

bool operator==(CudaHostAllocatorImpl const& x) const { return true; }
bool operator==(CudaHostAllocatorImpl const&) const { return true; }

bool operator!=(CudaHostAllocatorImpl const& x) const { return !operator==(x); }
};

template <typename T>
using pinned_allocator = CudaHostAllocatorImpl<T, PinnedAllocPolicy>; // NOLINT
using PinnedAllocator = CudaHostAllocatorImpl<T, PinnedAllocPolicy>; // NOLINT

template <typename T>
using managed_allocator = CudaHostAllocatorImpl<T, ManagedAllocPolicy>; // NOLINT
using ManagedAllocator = CudaHostAllocatorImpl<T, ManagedAllocPolicy>; // NOLINT

template <typename T>
using SamAllocator = CudaHostAllocatorImpl<T, SamAllocPolicy>;
} // namespace xgboost::common::cuda_impl
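A minimal usage sketch for the aliases above, assuming a CUDA-capable build and the include path used within src/common; the function name is illustrative. SamAllocator backs the container with ordinary malloc'd memory registered via cudaHostRegister, so the buffer is host-resident but directly accessible from device code.

#include <cstddef>
#include <vector>

#include "cuda_pinned_allocator.h"  // assumed include path within src/common

using xgboost::common::cuda_impl::SamAllocator;

// Returns a host buffer whose storage is registered with the CUDA driver.
std::vector<std::byte, SamAllocator<std::byte>> MakeRegisteredHostBuffer(std::size_t n_bytes) {
  return std::vector<std::byte, SamAllocator<std::byte>>(n_bytes);
}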
3 changes: 3 additions & 0 deletions src/common/io.h
@@ -286,6 +286,7 @@ class ResourceHandler {
kMmap = 1,
kCudaMalloc = 2,
kCudaMmap = 3,
kCudaHostCache = 4,
};

private:
@@ -310,6 +311,8 @@ class ResourceHandler {
return "CudaMalloc";
case kCudaMmap:
return "CudaMmap";
case kCudaHostCache:
return "CudaHostCache";
}
LOG(FATAL) << "Unreachable.";
return {};
12 changes: 9 additions & 3 deletions src/common/ref_resource_view.cuh
@@ -16,8 +16,7 @@ namespace xgboost::common {
* @brief Make a fixed size `RefResourceView` with cudaMalloc resource.
*/
template <typename T>
[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const*,
std::size_t n_elements) {
[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(std::size_t n_elements) {
auto resource = std::make_shared<common::CudaMallocResource>(n_elements * sizeof(T));
auto ref = RefResourceView{resource->DataAs<T>(), n_elements, resource};
return ref;
@@ -26,8 +25,15 @@ template <typename T>
template <typename T>
[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const* ctx,
std::size_t n_elements, T const& init) {
auto ref = MakeFixedVecWithCudaMalloc<T>(ctx, n_elements);
auto ref = MakeFixedVecWithCudaMalloc<T>(n_elements);
thrust::fill_n(ctx->CUDACtx()->CTP(), ref.data(), ref.size(), init);
return ref;
}

template <typename T>
[[nodiscard]] RefResourceView<T> MakeFixedVecWithPinnedMalloc(std::size_t n_elements) {
auto resource = std::make_shared<common::CudaPinnedResource>(n_elements * sizeof(T));
auto ref = RefResourceView{resource->DataAs<T>(), n_elements, resource};
return ref;
}
} // namespace xgboost::common
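A hedged usage sketch for the new helper, assuming the header is included from within src/common; the element type and size are illustrative.

#include <cstdint>

#include "ref_resource_view.cuh"  // assumed include path

void PinnedViewExample() {
  // The view is backed by a CudaPinnedResource that stays alive for as long as
  // the view holds its shared_ptr to the resource.
  auto host_buf = xgboost::common::MakeFixedVecWithPinnedMalloc<std::uint8_t>(1024);
  std::uint8_t* ptr = host_buf.data();  // pinned (cudaHostRegister-ed) host memory
  (void)ptr;                            // host_buf.size() == 1024
}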
23 changes: 20 additions & 3 deletions src/common/resource.cuh
@@ -5,9 +5,10 @@
#include <cstddef> // for size_t
#include <functional> // for function

#include "device_vector.cuh" // for DeviceUVector
#include "io.h" // for ResourceHandler, MMAPFile
#include "xgboost/string_view.h" // for StringView
#include "cuda_pinned_allocator.h" // for SamAllocator
#include "device_vector.cuh" // for DeviceUVector
#include "io.h" // for ResourceHandler, MMAPFile
#include "xgboost/string_view.h" // for StringView

namespace xgboost::common {
/**
@@ -29,6 +30,22 @@ class CudaMallocResource : public ResourceHandler {
void Resize(std::size_t n_bytes) { this->storage_.resize(n_bytes); }
};

class CudaPinnedResource : public ResourceHandler {
std::vector<std::byte, cuda_impl::SamAllocator<std::byte>> storage_;

void Clear() noexcept(true) { this->Resize(0); }

public:
explicit CudaPinnedResource(std::size_t n_bytes) : ResourceHandler{kCudaHostCache} {
this->Resize(n_bytes);
}
~CudaPinnedResource() noexcept(true) override { this->Clear(); }

[[nodiscard]] void* Data() override { return storage_.data(); }
[[nodiscard]] std::size_t Size() const override { return storage_.size(); }
void Resize(std::size_t n_bytes) { this->storage_.resize(n_bytes); }
};
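A short sketch of constructing the new resource directly, assuming the header above; in the commit it is normally created through MakeFixedVecWithPinnedMalloc rather than by hand.

#include <cstddef>
#include <memory>

#include "resource.cuh"  // assumed include path

std::shared_ptr<xgboost::common::ResourceHandler> MakeHostCacheResource(std::size_t n_bytes) {
  // Owns cudaHostRegister-ed system memory; the reported kind is kCudaHostCache.
  return std::make_shared<xgboost::common::CudaPinnedResource>(n_bytes);
}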

class CudaMmapResource : public ResourceHandler {
std::unique_ptr<MMAPFile, std::function<void(MMAPFile*)>> handle_;
std::size_t n_;
7 changes: 5 additions & 2 deletions src/data/ellpack_page.cu
@@ -404,7 +404,7 @@ size_t EllpackPageImpl::Copy(Context const* ctx, EllpackPageImpl const* page, bs
bst_idx_t num_elements = page->n_rows * page->row_stride;
CHECK_EQ(this->row_stride, page->row_stride);
CHECK_EQ(NumSymbols(), page->NumSymbols());
CHECK_GE(n_rows * row_stride, offset + num_elements);
CHECK_GE(this->n_rows * this->row_stride, offset + num_elements);
if (page == this) {
LOG(FATAL) << "Concatenating the same Ellpack.";
return this->n_rows * this->row_stride;
@@ -542,7 +542,10 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
// Return the number of rows contained in this page.
[[nodiscard]] bst_idx_t EllpackPageImpl::Size() const { return n_rows; }

std::size_t EllpackPageImpl::MemCostBytes() const { return this->gidx_buffer.size_bytes(); }
std::size_t EllpackPageImpl::MemCostBytes() const {
return this->gidx_buffer.size_bytes() + sizeof(this->n_rows) + sizeof(this->is_dense) +
sizeof(this->row_stride) + sizeof(this->base_rowid);
}
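For illustration, and assuming 8-byte n_rows, row_stride, and base_rowid plus a 1-byte is_dense, a page whose gidx_buffer occupies 1 MiB now reports 1,048,576 + 8 + 1 + 8 + 8 = 1,048,601 bytes, whereas it previously reported only the 1,048,576-byte buffer.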

EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
DeviceOrd device, common::Span<FeatureType const> feature_types) const {
5 changes: 3 additions & 2 deletions src/data/ellpack_page.cuh
@@ -66,6 +66,7 @@ struct EllpackDeviceAccessor {
min_fvalue = cuts->min_vals_.ConstHostSpan();
}
}

/**
* @brief Given a row index and a feature index, returns the corresponding cut value.
*
@@ -75,7 +76,7 @@
* local to the current batch.
*/
template <bool global_ridx = true>
[[nodiscard]] __device__ bst_bin_t GetBinIndex(size_t ridx, size_t fidx) const {
[[nodiscard]] __device__ bst_bin_t GetBinIndex(bst_idx_t ridx, size_t fidx) const {
if (global_ridx) {
ridx -= base_rowid;
}
@@ -114,7 +115,7 @@ struct EllpackDeviceAccessor {
return idx;
}

[[nodiscard]] __device__ float GetFvalue(size_t ridx, size_t fidx) const {
[[nodiscard]] __device__ float GetFvalue(bst_idx_t ridx, size_t fidx) const {
auto gidx = GetBinIndex(ridx, fidx);
if (gidx == -1) {
return std::numeric_limits<float>::quiet_NaN();
51 changes: 7 additions & 44 deletions src/data/ellpack_page_raw_format.cu
@@ -39,8 +39,7 @@ template <typename T>
return false;
}

auto ctx = Context{}.MakeCUDA(common::CurrentDevice());
*vec = common::MakeFixedVecWithCudaMalloc<T>(&ctx, n);
*vec = common::MakeFixedVecWithCudaMalloc<T>(n);
dh::safe_cuda(cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream()));
return true;
}
@@ -96,57 +95,21 @@ template <typename T>
CHECK(this->cuts_->cut_values_.DeviceCanRead());
impl->SetCuts(this->cuts_);

// Read vector
Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
auto read_vec = [&] {
common::NvtxScopedRange range{common::NvtxEventAttr{"read-vec", common::NvtxRgb{127, 255, 0}}};
bst_idx_t n{0};
RET_IF_NOT(fi->Read(&n));
if (n == 0) {
return true;
}
impl->gidx_buffer = common::MakeFixedVecWithCudaMalloc<common::CompressedByteT>(&ctx, n);
RET_IF_NOT(fi->Read(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes()));
return true;
};
RET_IF_NOT(read_vec());

RET_IF_NOT(fi->Read(&impl->n_rows));
RET_IF_NOT(fi->Read(&impl->is_dense));
RET_IF_NOT(fi->Read(&impl->row_stride));
RET_IF_NOT(fi->Read(&impl->base_rowid));

fi->Read(page);
dh::DefaultStream().Sync();

return true;
}

[[nodiscard]] std::size_t EllpackPageRawFormat::Write(const EllpackPage& page,
EllpackHostCacheStream* fo) const {
xgboost_NVTX_FN_RANGE();

bst_idx_t bytes{0};
auto* impl = page.Impl();

// Write vector
auto write_vec = [&] {
common::NvtxScopedRange range{common::NvtxEventAttr{"write-vec", common::NvtxRgb{127, 255, 0}}};
bst_idx_t n = impl->gidx_buffer.size();
bytes += fo->Write(n);

if (!impl->gidx_buffer.empty()) {
bytes += fo->Write(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes());
}
};

write_vec();

bytes += fo->Write(impl->n_rows);
bytes += fo->Write(impl->is_dense);
bytes += fo->Write(impl->row_stride);
bytes += fo->Write(impl->base_rowid);

fo->Write(page);
dh::DefaultStream().Sync();
return bytes;

auto* impl = page.Impl();
return impl->MemCostBytes();
}

#undef RET_IF_NOT
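A hedged sketch of the page-based round trip this file now implements: the whole EllpackPage is handed to the host-cache stream instead of serializing gidx_buffer and the scalar fields one by one. Namespace qualifiers and construction of the format/stream objects are omitted, and Read()'s exact signature is an assumption inferred from the usage above.

std::size_t RoundTrip(EllpackPageRawFormat* fmt, EllpackHostCacheStream* cache,
                      EllpackPage const& in, EllpackPage* out) {
  std::size_t n_bytes = fmt->Write(in, cache);  // hands the whole page to the host cache
  CHECK(fmt->Read(out, cache));                 // reads the cached page back
  return n_bytes;                               // Write() now returns the page's MemCostBytes()
}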