From ac81fd51727200400d77445d5fb0670d64fb6b3f Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Tue, 14 Jan 2025 15:56:40 +0100
Subject: [PATCH] Replace cub::Traits by numeric_limits and deprecate cub::Traits

Fixes: #3381
---
 c2h/generators.cu                             |  27 +-
 c2h/include/c2h/bfloat16.cuh                  |  26 +-
 c2h/include/c2h/custom_type.h                 |  11 +-
 c2h/include/c2h/generators.h                  |  45 +--
 c2h/include/c2h/half.cuh                      |  30 +-
 c2h/include/c2h/test_util_vec.h               | 298 +++++++++++------
 cub/benchmarks/bench/reduce/arg_extrema.cu    |   4 +-
 .../nvbench_helper/nvbench_helper.cuh         |  22 +-
 cub/cub/block/radix_rank_sort_operations.cuh  |  26 +-
 cub/cub/device/device_reduce.cuh              |  14 +-
 cub/cub/device/device_segmented_reduce.cuh    |  15 +-
 cub/cub/util_type.cuh                         | 305 +++++++++++-------
 cub/test/catch2_radix_sort_helper.cuh         |  10 +-
 cub/test/catch2_test_block_radix_sort.cu      |   3 +-
 cub/test/catch2_test_device_histogram.cu      |  11 +-
 .../catch2_test_device_radix_sort_keys.cu     |   2 +-
 cub/test/catch2_test_device_reduce.cuh        |  34 +-
 cub/test/catch2_test_device_reduce_by_key.cu  |   2 +-
 cub/test/catch2_test_device_scan.cu           |   6 +-
 cub/test/catch2_test_device_scan_iterators.cu |   4 +-
 .../catch2_test_device_segmented_reduce.cu    |   4 +-
 cub/test/test_util.h                          |  58 ++--
 .../is_extended_floating_point.h              |  20 ++
 .../meta.unary.cat/is_floating_point.pass.cpp |   4 +
 24 files changed, 535 insertions(+), 446 deletions(-)
diff --git a/c2h/generators.cu b/c2h/generators.cu
index 8044eabe6fe..771cc234c90 100644
--- a/c2h/generators.cu
+++ b/c2h/generators.cu
@@ -40,7 +40,7 @@
 #include <thrust/scan.h>
 #include <thrust/tabulate.h>
 
-#include <cuda/std/type_traits>
+#include <cuda/type_traits>
 
 #include <cstdint>
 
@@ -118,30 +118,7 @@ private:
   c2h::device_vector<float> m_distribution;
 };
 
-// TODO(bgruber): modelled after cub::Traits. We should generalize this somewhere into libcu++.
-template <typename T>
-struct is_floating_point : ::cuda::std::is_floating_point<T>
-{};
-#ifdef _CCCL_HAS_NVFP16
-template <>
-struct is_floating_point<__half> : ::cuda::std::true_type
-{};
-#endif // _CCCL_HAS_NVFP16
-#ifdef _CCCL_HAS_NVBF16
-template <>
-struct is_floating_point<__nv_bfloat16> : ::cuda::std::true_type
-{};
-#endif // _CCCL_HAS_NVBF16
-#ifdef __CUDA_FP8_TYPES_EXIST__
-template <>
-struct is_floating_point<__nv_fp8_e4m3> : ::cuda::std::true_type
-{};
-template <>
-struct is_floating_point<__nv_fp8_e5m2> : ::cuda::std::true_type
-{};
-#endif // __CUDA_FP8_TYPES_EXIST__
-
-template <typename T, bool = is_floating_point<T>::value>
+template <typename T, bool = ::cuda::is_floating_point_v<T>>
 struct random_to_item_t
 {
   float m_min;
diff --git a/c2h/include/c2h/bfloat16.cuh b/c2h/include/c2h/bfloat16.cuh
index b7598562715..77701936bca 100644
--- a/c2h/include/c2h/bfloat16.cuh
+++ b/c2h/include/c2h/bfloat16.cuh
@@ -211,6 +211,10 @@ struct bfloat16_t
   }
 };
 
+#ifdef __GNUC__
+#  pragma GCC diagnostic pop
+#endif
+
 /******************************************************************************
  * I/O stream overloads
  ******************************************************************************/
@@ -229,28 +233,28 @@ inline std::ostream& operator<<(std::ostream& out, const __nv_bfloat16& x)
 }
 
 /******************************************************************************
- * Traits overloads
+ * numeric_limits and unsigned_bits specializations
  ******************************************************************************/
 
+_LIBCUDACXX_BEGIN_NAMESPACE_STD
 template <>
-struct CUB_NS_QUALIFIER::FpLimits<bfloat16_t>
+class numeric_limits<bfloat16_t>
 {
-  static __host__ __device__ __forceinline__ bfloat16_t Max()
+public:
+  static __host__ __device__ __forceinline__ bfloat16_t max()
   {
     return bfloat16_t::max();
   }
 
-  static __host__ __device__ __forceinline__ bfloat16_t Lowest()
+  static __host__ __device__ __forceinline__ bfloat16_t lowest()
   {
     return bfloat16_t::lowest();
   }
 };
+_LIBCUDACXX_END_NAMESPACE_STD
 
 template <>
-struct CUB_NS_QUALIFIER::NumericTraits<bfloat16_t>
-    : CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, true, false, unsigned short, bfloat16_t>
-{};
-
-#ifdef __GNUC__
-#  pragma GCC diagnostic pop
-#endif
+struct CUB_NS_QUALIFIER::detail::unsigned_bits<bfloat16_t, void>
+{
+  using type = unsigned short;
+};
diff --git a/c2h/include/c2h/custom_type.h b/c2h/include/c2h/custom_type.h
index ddbceef388a..e8759481eec 100644
--- a/c2h/include/c2h/custom_type.h
+++ b/c2h/include/c2h/custom_type.h
@@ -178,13 +178,12 @@ class accumulateable_t
 
 } // namespace c2h
 
-namespace std
-{
+_LIBCUDACXX_BEGIN_NAMESPACE_STD
 template <template <typename> class... Policies>
 class numeric_limits<c2h::custom_type_t<Policies...>>
 {
 public:
-  static c2h::custom_type_t<Policies...> max()
+  static __host__ __device__ c2h::custom_type_t<Policies...> max()
   {
     c2h::custom_type_t<Policies...> val;
     val.key = std::numeric_limits<std::size_t>::max();
@@ -192,7 +191,7 @@ class numeric_limits<c2h::custom_type_t<Policies...>>
     return val;
   }
 
-  static c2h::custom_type_t<Policies...> min()
+  static __host__ __device__ c2h::custom_type_t<Policies...> min()
   {
     c2h::custom_type_t<Policies...> val;
     val.key = std::numeric_limits<std::size_t>::min();
@@ -200,7 +199,7 @@ class numeric_limits<c2h::custom_type_t<Policies...>>
     return val;
   }
 
-  static c2h::custom_type_t<Policies...> lowest()
+  static __host__ __device__ c2h::custom_type_t<Policies...> lowest()
   {
     c2h::custom_type_t<Policies...> val;
     val.key = std::numeric_limits<std::size_t>::lowest();
@@ -208,4 +207,4 @@ class numeric_limits<c2h::custom_type_t<Policies...>>
     return val;
   }
 };
-} // namespace std
+_LIBCUDACXX_END_NAMESPACE_STD
diff --git a/c2h/include/c2h/generators.h b/c2h/include/c2h/generators.h
index 62f169e9e21..0a6b2e0778f 100644
--- a/c2h/include/c2h/generators.h
+++ b/c2h/include/c2h/generators.h
@@ -29,7 +29,7 @@
 
 #include <thrust/detail/config/device_system.h>
 
-#include <limits>
+#include <cuda/std/limits>
 
 #include <c2h/custom_type.h>
 #include <c2h/vector.h>
@@ -52,41 +52,6 @@ _CCCL_DIAG_PUSH
 _CCCL_DIAG_POP
 #    endif // _CCCL_CUDACC_AT_LEAST(11, 8)
 #  endif // _CCCL_HAS_NVBF16
-
-#  if defined(__CUDA_FP8_TYPES_EXIST__)
-namespace std
-{
-template <>
-class numeric_limits<__nv_fp8_e4m3>
-{
-public:
-  static __nv_fp8_e4m3 max()
-  {
-    return cub::Traits<__nv_fp8_e4m3>::Max();
-  }
-
-  static __nv_fp8_e4m3 lowest()
-  {
-    return cub::Traits<__nv_fp8_e4m3>::Lowest();
-  }
-};
-
-template <>
-class numeric_limits<__nv_fp8_e5m2>
-{
-public:
-  static __nv_fp8_e5m2 max()
-  {
-    return cub::Traits<__nv_fp8_e5m2>::Max();
-  }
-
-  static __nv_fp8_e5m2 lowest()
-  {
-    return cub::Traits<__nv_fp8_e5m2>::Lowest();
-  }
-};
-} // namespace std
-#  endif // defined(__CUDA_FP8_TYPES_EXIST__)
 #endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
 
 namespace c2h
@@ -157,8 +122,8 @@ void init_key_segments(const c2h::device_vector<OffsetT>& segment_offsets, KeyT*
 template <template <typename> class... Ps>
 void gen(seed_t seed,
          c2h::device_vector<c2h::custom_type_t<Ps...>>& data,
-         c2h::custom_type_t<Ps...> min = std::numeric_limits<c2h::custom_type_t<Ps...>>::lowest(),
-         c2h::custom_type_t<Ps...> max = std::numeric_limits<c2h::custom_type_t<Ps...>>::max())
+         c2h::custom_type_t<Ps...> min = ::cuda::std::numeric_limits<c2h::custom_type_t<Ps...>>::lowest(),
+         c2h::custom_type_t<Ps...> max = ::cuda::std::numeric_limits<c2h::custom_type_t<Ps...>>::max())
 {
   detail::gen(seed,
               reinterpret_cast<char*>(thrust::raw_pointer_cast(data.data())),
@@ -171,8 +136,8 @@ void gen(seed_t seed,
 template <typename T>
 void gen(seed_t seed,
          c2h::device_vector<T>& data,
-         T min = std::numeric_limits<T>::lowest(),
-         T max = std::numeric_limits<T>::max());
+         T min = ::cuda::std::numeric_limits<T>::lowest(),
+         T max = ::cuda::std::numeric_limits<T>::max());
 
 template <typename T>
 void gen(modulo_t mod, c2h::device_vector<T>& data);
diff --git a/c2h/include/c2h/half.cuh b/c2h/include/c2h/half.cuh
index 3e59c0933f3..0d94bd7911f 100644
--- a/c2h/include/c2h/half.cuh
+++ b/c2h/include/c2h/half.cuh
@@ -37,6 +37,7 @@
 
 #include <cub/util_type.cuh>
 
+#include <cuda/std/limits>
 #include <cuda/std/type_traits>
 
 #include <cstdint>
@@ -306,6 +307,10 @@ struct half_t
   }
 };
 
+#ifdef __GNUC__
+#  pragma GCC diagnostic pop
+#endif
+
 /******************************************************************************
  * I/O stream overloads
  ******************************************************************************/
@@ -324,28 +329,33 @@ inline std::ostream& operator<<(std::ostream& out, const __half& x)
 }
 
 /******************************************************************************
- * Traits overloads
+ * numeric_limits and unsigned_bits specializations
  ******************************************************************************/
 
+_LIBCUDACXX_BEGIN_NAMESPACE_STD
 template <>
-struct CUB_NS_QUALIFIER::FpLimits<half_t>
+class numeric_limits<half_t>
 {
-  static __host__ __device__ __forceinline__ half_t Max()
+public:
+  static __host__ __device__ __forceinline__ half_t max()
   {
     return (half_t::max)();
   }
 
-  static __host__ __device__ __forceinline__ half_t Lowest()
+  static __host__ __device__ __forceinline__ half_t lowest()
   {
     return half_t::lowest();
   }
 };
+_LIBCUDACXX_END_NAMESPACE_STD
 
 template <>
-struct CUB_NS_QUALIFIER::NumericTraits<half_t>
-    : CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, true, false, unsigned short, half_t>
-{};
+struct CUB_NS_QUALIFIER::detail::unsigned_bits<half_t, void>
+{
+  using type = unsigned short;
+};
 
-#ifdef __GNUC__
-#  pragma GCC diagnostic pop
-#endif
+// template <>
+// struct CUB_NS_QUALIFIER::detail::NumericTraits<half_t>
+//     : CUB_NS_QUALIFIER::detail::BaseTraits<FLOATING_POINT, true, false, unsigned short, half_t>
+// {};
diff --git a/c2h/include/c2h/test_util_vec.h b/c2h/include/c2h/test_util_vec.h
index 128fb5dbfce..4304cfe6c02 100644
--- a/c2h/include/c2h/test_util_vec.h
+++ b/c2h/include/c2h/test_util_vec.h
@@ -29,6 +29,8 @@
 
 #include <thrust/detail/config/device_system.h>
 
+#include <cuda/std/limits>
+
 #include <iostream>
 
 /******************************************************************************
@@ -288,123 +290,221 @@ C2H_VEC_OVERLOAD(float, float)
 C2H_VEC_OVERLOAD(double, double)
 
 /*
- * The following section defines macros to overload cub::NumericTraits<T>::{Max,Lowest}() for vector
+ * These macros specialize cub::detail::NumericTraits<T>::{Max,Lowest}() and cuda::std::numeric_limits<T> for vector
  * types.
  */
 
 /**
  * Vector1 overloads
  */
-#  define C2H_VEC_1_TRAITS_OVERLOAD(T, BaseT)            \
-    CUB_NAMESPACE_BEGIN                                  \
-    template <>                                          \
-    struct NumericTraits<T>                              \
-    {                                                    \
-      static constexpr Category CATEGORY = NOT_A_NUMBER; \
-      enum                                               \
-      {                                                  \
-        PRIMITIVE = false,                               \
-        NULL_TYPE = false,                               \
-      };                                                 \
-      static __host__ __device__ T Max()                 \
-      {                                                  \
-        T retval = {NumericTraits<BaseT>::Max()};        \
-        return retval;                                   \
-      }                                                  \
-      static __host__ __device__ T Lowest()              \
-      {                                                  \
-        T retval = {NumericTraits<BaseT>::Lowest()};     \
-        return retval;                                   \
-      }                                                  \
-    };                                                   \
-    CUB_NAMESPACE_END
+#  define C2H_VEC_1_TRAITS_OVERLOAD(T, BaseT)                      \
+    CUB_NAMESPACE_BEGIN                                            \
+    namespace detail                                               \
+    {                                                              \
+    template <>                                                    \
+    struct NumericTraits<T>                                        \
+    {                                                              \
+      static constexpr Category CATEGORY = NOT_A_NUMBER;           \
+      enum                                                         \
+      {                                                            \
+        PRIMITIVE = false,                                         \
+        NULL_TYPE = false,                                         \
+      };                                                           \
+      static __host__ __device__ T Max()                           \
+      {                                                            \
+        T retval = {::cuda::std::numeric_limits<BaseT>::max()};    \
+        return retval;                                             \
+      }                                                            \
+      static __host__ __device__ T Lowest()                        \
+      {                                                            \
+        T retval = {::cuda::std::numeric_limits<BaseT>::lowest()}; \
+        return retval;                                             \
+      }                                                            \
+    };                                                             \
+    }                                                              \
+    CUB_NAMESPACE_END                                              \
+    _LIBCUDACXX_BEGIN_NAMESPACE_STD                                \
+    template <>                                                    \
+    class numeric_limits<T>                                        \
+    {                                                              \
+    public:                                                        \
+      static constexpr bool is_specialized = true;                 \
+      static __host__ __device__ T max()                           \
+      {                                                            \
+        T retval = {::cuda::std::numeric_limits<BaseT>::max()};    \
+        return retval;                                             \
+      }                                                            \
+      static __host__ __device__ T lowest()                        \
+      {                                                            \
+        T retval = {::cuda::std::numeric_limits<BaseT>::lowest()}; \
+        return retval;                                             \
+      }                                                            \
+    };                                                             \
+    _LIBCUDACXX_END_NAMESPACE_STD
 
 /**
  * Vector2 overloads
  */
-#  define C2H_VEC_2_TRAITS_OVERLOAD(T, BaseT)                                        \
-    CUB_NAMESPACE_BEGIN                                                              \
-    template <>                                                                      \
-    struct NumericTraits<T>                                                          \
-    {                                                                                \
-      static constexpr Category CATEGORY = NOT_A_NUMBER;                             \
-      enum                                                                           \
-      {                                                                              \
-        PRIMITIVE = false,                                                           \
-        NULL_TYPE = false,                                                           \
-      };                                                                             \
-      static __host__ __device__ T Max()                                             \
-      {                                                                              \
-        T retval = {NumericTraits<BaseT>::Max(), NumericTraits<BaseT>::Max()};       \
-        return retval;                                                               \
-      }                                                                              \
-      static __host__ __device__ T Lowest()                                          \
-      {                                                                              \
-        T retval = {NumericTraits<BaseT>::Lowest(), NumericTraits<BaseT>::Lowest()}; \
-        return retval;                                                               \
-      }                                                                              \
-    };                                                                               \
-    CUB_NAMESPACE_END
+#  define C2H_VEC_2_TRAITS_OVERLOAD(T, BaseT)                                                                    \
+    CUB_NAMESPACE_BEGIN                                                                                          \
+    namespace detail                                                                                             \
+    {                                                                                                            \
+    template <>                                                                                                  \
+    struct NumericTraits<T>                                                                                      \
+    {                                                                                                            \
+      static constexpr Category CATEGORY = NOT_A_NUMBER;                                                         \
+      enum                                                                                                       \
+      {                                                                                                          \
+        PRIMITIVE = false,                                                                                       \
+        NULL_TYPE = false,                                                                                       \
+      };                                                                                                         \
+      static __host__ __device__ T Max()                                                                         \
+      {                                                                                                          \
+        T retval = {::cuda::std::numeric_limits<BaseT>::max(), ::cuda::std::numeric_limits<BaseT>::max()};       \
+        return retval;                                                                                           \
+      }                                                                                                          \
+      static __host__ __device__ T Lowest()                                                                      \
+      {                                                                                                          \
+        T retval = {::cuda::std::numeric_limits<BaseT>::lowest(), ::cuda::std::numeric_limits<BaseT>::lowest()}; \
+        return retval;                                                                                           \
+      }                                                                                                          \
+    };                                                                                                           \
+    }                                                                                                            \
+    CUB_NAMESPACE_END                                                                                            \
+    _LIBCUDACXX_BEGIN_NAMESPACE_STD                                                                              \
+    template <>                                                                                                  \
+    class numeric_limits<T>                                                                                      \
+    {                                                                                                            \
+    public:                                                                                                      \
+      static constexpr bool is_specialized = true;                                                               \
+      static __host__ __device__ T max()                                                                         \
+      {                                                                                                          \
+        T retval = {::cuda::std::numeric_limits<BaseT>::max(), ::cuda::std::numeric_limits<BaseT>::max()};       \
+        return retval;                                                                                           \
+      }                                                                                                          \
+      static __host__ __device__ T lowest()                                                                      \
+      {                                                                                                          \
+        T retval = {::cuda::std::numeric_limits<BaseT>::lowest(), ::cuda::std::numeric_limits<BaseT>::lowest()}; \
+        return retval;                                                                                           \
+      }                                                                                                          \
+    };                                                                                                           \
+    _LIBCUDACXX_END_NAMESPACE_STD
 
 /**
  * Vector3 overloads
  */
-#  define C2H_VEC_3_TRAITS_OVERLOAD(T, BaseT)                                                                        \
-    CUB_NAMESPACE_BEGIN                                                                                              \
-    template <>                                                                                                      \
-    struct NumericTraits<T>                                                                                          \
-    {                                                                                                                \
-      static constexpr Category CATEGORY = NOT_A_NUMBER;                                                             \
-      enum                                                                                                           \
-      {                                                                                                              \
-        PRIMITIVE = false,                                                                                           \
-        NULL_TYPE = false,                                                                                           \
-      };                                                                                                             \
-      static __host__ __device__ T Max()                                                                             \
-      {                                                                                                              \
-        T retval = {NumericTraits<BaseT>::Max(), NumericTraits<BaseT>::Max(), NumericTraits<BaseT>::Max()};          \
-        return retval;                                                                                               \
-      }                                                                                                              \
-      static __host__ __device__ T Lowest()                                                                          \
-      {                                                                                                              \
-        T retval = {NumericTraits<BaseT>::Lowest(), NumericTraits<BaseT>::Lowest(), NumericTraits<BaseT>::Lowest()}; \
-        return retval;                                                                                               \
-      }                                                                                                              \
-    };                                                                                                               \
-    CUB_NAMESPACE_END
+#  define C2H_VEC_3_TRAITS_OVERLOAD(T, BaseT)                      \
+    CUB_NAMESPACE_BEGIN                                            \
+    namespace detail                                               \
+    {                                                              \
+    template <>                                                    \
+    struct NumericTraits<T>                                        \
+    {                                                              \
+      static constexpr Category CATEGORY = NOT_A_NUMBER;           \
+      enum                                                         \
+      {                                                            \
+        PRIMITIVE = false,                                         \
+        NULL_TYPE = false,                                         \
+      };                                                           \
+      static __host__ __device__ T Max()                           \
+      {                                                            \
+        T retval = {::cuda::std::numeric_limits<BaseT>::max(),     \
+                    ::cuda::std::numeric_limits<BaseT>::max(),     \
+                    ::cuda::std::numeric_limits<BaseT>::max()};    \
+        return retval;                                             \
+      }                                                            \
+      static __host__ __device__ T Lowest()                        \
+      {                                                            \
+        T retval = {::cuda::std::numeric_limits<BaseT>::lowest(),  \
+                    ::cuda::std::numeric_limits<BaseT>::lowest(),  \
+                    ::cuda::std::numeric_limits<BaseT>::lowest()}; \
+        return retval;                                             \
+      }                                                            \
+    };                                                             \
+    }                                                              \
+    CUB_NAMESPACE_END                                              \
+    _LIBCUDACXX_BEGIN_NAMESPACE_STD                                \
+    template <>                                                    \
+    class numeric_limits<T>                                        \
+    {                                                              \
+    public:                                                        \
+      static constexpr bool is_specialized = true;                 \
+      static __host__ __device__ T max()                           \
+      {                                                            \
+        T retval = {::cuda::std::numeric_limits<BaseT>::max(),     \
+                    ::cuda::std::numeric_limits<BaseT>::max(),     \
+                    ::cuda::std::numeric_limits<BaseT>::max()};    \
+        return retval;                                             \
+      }                                                            \
+      static __host__ __device__ T lowest()                        \
+      {                                                            \
+        T retval = {::cuda::std::numeric_limits<BaseT>::lowest(),  \
+                    ::cuda::std::numeric_limits<BaseT>::lowest(),  \
+                    ::cuda::std::numeric_limits<BaseT>::lowest()}; \
+        return retval;                                             \
+      }                                                            \
+    };                                                             \
+    _LIBCUDACXX_END_NAMESPACE_STD
 
 /**
  * Vector4 overloads
  */
-#  define C2H_VEC_4_TRAITS_OVERLOAD(T, BaseT)            \
-    CUB_NAMESPACE_BEGIN                                  \
-    template <>                                          \
-    struct NumericTraits<T>                              \
-    {                                                    \
-      static constexpr Category CATEGORY = NOT_A_NUMBER; \
-      enum                                               \
-      {                                                  \
-        PRIMITIVE = false,                               \
-        NULL_TYPE = false,                               \
-      };                                                 \
-      static __host__ __device__ T Max()                 \
-      {                                                  \
-        T retval = {NumericTraits<BaseT>::Max(),         \
-                    NumericTraits<BaseT>::Max(),         \
-                    NumericTraits<BaseT>::Max(),         \
-                    NumericTraits<BaseT>::Max()};        \
-        return retval;                                   \
-      }                                                  \
-      static __host__ __device__ T Lowest()              \
-      {                                                  \
-        T retval = {NumericTraits<BaseT>::Lowest(),      \
-                    NumericTraits<BaseT>::Lowest(),      \
-                    NumericTraits<BaseT>::Lowest(),      \
-                    NumericTraits<BaseT>::Lowest()};     \
-        return retval;                                   \
-      }                                                  \
-    };                                                   \
-    CUB_NAMESPACE_END
+#  define C2H_VEC_4_TRAITS_OVERLOAD(T, BaseT)                      \
+    CUB_NAMESPACE_BEGIN                                            \
+    namespace detail                                               \
+    {                                                              \
+    template <>                                                    \
+    struct NumericTraits<T>                                        \
+    {                                                              \
+      static constexpr Category CATEGORY = NOT_A_NUMBER;           \
+      enum                                                         \
+      {                                                            \
+        PRIMITIVE = false,                                         \
+        NULL_TYPE = false,                                         \
+      };                                                           \
+      static __host__ __device__ T Max()                           \
+      {                                                            \
+        T retval = {::cuda::std::numeric_limits<BaseT>::max(),     \
+                    ::cuda::std::numeric_limits<BaseT>::max(),     \
+                    ::cuda::std::numeric_limits<BaseT>::max(),     \
+                    ::cuda::std::numeric_limits<BaseT>::max()};    \
+        return retval;                                             \
+      }                                                            \
+      static __host__ __device__ T Lowest()                        \
+      {                                                            \
+        T retval = {::cuda::std::numeric_limits<BaseT>::lowest(),  \
+                    ::cuda::std::numeric_limits<BaseT>::lowest(),  \
+                    ::cuda::std::numeric_limits<BaseT>::lowest(),  \
+                    ::cuda::std::numeric_limits<BaseT>::lowest()}; \
+        return retval;                                             \
+      }                                                            \
+    };                                                             \
+    }                                                              \
+    CUB_NAMESPACE_END                                              \
+    _LIBCUDACXX_BEGIN_NAMESPACE_STD                                \
+    template <>                                                    \
+    class numeric_limits<T>                                        \
+    {                                                              \
+    public:                                                        \
+      static constexpr bool is_specialized = true;                 \
+      static __host__ __device__ T max()                           \
+      {                                                            \
+        T retval = {::cuda::std::numeric_limits<BaseT>::max(),     \
+                    ::cuda::std::numeric_limits<BaseT>::max(),     \
+                    ::cuda::std::numeric_limits<BaseT>::max(),     \
+                    ::cuda::std::numeric_limits<BaseT>::max()};    \
+        return retval;                                             \
+      }                                                            \
+      static __host__ __device__ T lowest()                        \
+      {                                                            \
+        T retval = {::cuda::std::numeric_limits<BaseT>::lowest(),  \
+                    ::cuda::std::numeric_limits<BaseT>::lowest(),  \
+                    ::cuda::std::numeric_limits<BaseT>::lowest(),  \
+                    ::cuda::std::numeric_limits<BaseT>::lowest()}; \
+        return retval;                                             \
+      }                                                            \
+    };                                                             \
+    _LIBCUDACXX_END_NAMESPACE_STD
 
 /**
  * All vector overloads
diff --git a/cub/benchmarks/bench/reduce/arg_extrema.cu b/cub/benchmarks/bench/reduce/arg_extrema.cu
index 8e7e88ecaf9..28850da92e5 100644
--- a/cub/benchmarks/bench/reduce/arg_extrema.cu
+++ b/cub/benchmarks/bench/reduce/arg_extrema.cu
@@ -57,7 +57,9 @@ struct policy_hub_t
     // Type used for the final result
     using output_tuple_t = cub::KeyValuePair<global_offset_t, T>;
 
-    auto const init = ::cuda::std::is_same<OpT, cub::ArgMin>::value ? cub::Traits<T>::Max() : cub::Traits<T>::Lowest();
+    auto const init = ::cuda::std::is_same<OpT, cub::ArgMin>::value
+                      ? ::cuda::std::numeric_limits<T>::max()
+                      : ::cuda::std::numeric_limits<T>::lowest();
 
 #if !TUNE_BASE
     using policy_t   = policy_hub_t<output_tuple_t, per_partition_offset_t>;
diff --git a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh
index 8324650d044..d2b6fbf8456 100644
--- a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh
+++ b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh
@@ -6,9 +6,9 @@
 #include <thrust/execution_policy.h>
 
 #include <cuda/std/complex>
+#include <cuda/std/limits>
 #include <cuda/std/span>
 
-#include <limits>
 #include <map>
 #include <stdexcept>
 
@@ -260,8 +260,8 @@ struct generator_base_t
 template <class T>
 struct vector_generator_t : generator_base_t
 {
-  const T m_min{std::numeric_limits<T>::min()};
-  const T m_max{std::numeric_limits<T>::max()};
+  const T m_min{::cuda::std::numeric_limits<T>::min()};
+  const T m_max{::cuda::std::numeric_limits<T>::max()};
 
   operator thrust::device_vector<T>()
   {
@@ -275,17 +275,17 @@ struct vector_generator_t<void> : generator_base_t
   template <typename T>
   operator thrust::device_vector<T>()
   {
-    return generator_base_t::generate(std::numeric_limits<T>::min(), std::numeric_limits<T>::max());
+    return generator_base_t::generate(::cuda::std::numeric_limits<T>::min(), ::cuda::std::numeric_limits<T>::max());
   }
 
   // This overload is needed because numeric limits is not specialized for complex, making
   // the min and max values for complex equal zero.
   operator thrust::device_vector<complex>()
   {
-    const complex min =
-      complex{std::numeric_limits<complex::value_type>::min(), std::numeric_limits<complex::value_type>::min()};
-    const complex max =
-      complex{std::numeric_limits<complex::value_type>::max(), std::numeric_limits<complex::value_type>::max()};
+    const complex min = complex{
+      ::cuda::std::numeric_limits<complex::value_type>::min(), ::cuda::std::numeric_limits<complex::value_type>::min()};
+    const complex max = complex{
+      ::cuda::std::numeric_limits<complex::value_type>::max(), ::cuda::std::numeric_limits<complex::value_type>::max()};
 
     return generator_base_t::generate(min, max);
   }
@@ -407,8 +407,8 @@ struct gen_t
   vector_generator_t<T> operator()(
     std::size_t elements,
     bit_entropy entropy = bit_entropy::_1_000,
-    T min               = std::numeric_limits<T>::min,
-    T max               = std::numeric_limits<T>::max()) const
+    T min               = ::cuda::std::numeric_limits<T>::min(),
+    T max               = ::cuda::std::numeric_limits<T>::max()) const
   {
     return {{seed_t{}, elements, entropy}, min, max};
   }
@@ -464,7 +464,7 @@ struct less_t
     }
 
     const complex::value_type difference = cuda::std::abs(magnitude_0 - magnitude_1);
-    const complex::value_type threshold  = cuda::std::numeric_limits<complex::value_type>::epsilon() * 2;
+    const complex::value_type threshold  = ::cuda::std::numeric_limits<complex::value_type>::epsilon() * 2;
 
     if (difference < threshold)
     {
diff --git a/cub/cub/block/radix_rank_sort_operations.cuh b/cub/cub/block/radix_rank_sort_operations.cuh
index 35bdfe8ee02..08d6e6fc8d6 100644
--- a/cub/cub/block/radix_rank_sort_operations.cuh
+++ b/cub/cub/block/radix_rank_sort_operations.cuh
@@ -53,7 +53,7 @@
 #include <cuda/std/__algorithm/min.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/tuple>
-#include <cuda/std/type_traits>
+#include <cuda/type_traits>
 
 CUB_NAMESPACE_BEGIN
 
@@ -73,11 +73,10 @@ CUB_NAMESPACE_BEGIN
     and only one of them is used, the sorting works correctly. For double, the
     same applies, but with 64-bit patterns.
 */
-template <typename KeyT, Category TypeCategory = Traits<KeyT>::CATEGORY>
+template <typename KeyT, bool IsFP = ::cuda::is_floating_point_v<KeyT>>
 struct BaseDigitExtractor
 {
-  using TraitsT      = Traits<KeyT>;
-  using UnsignedBits = typename TraitsT::UnsignedBits;
+  using UnsignedBits = typename Twiddle<KeyT>::UnsignedBits;
 
   static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits ProcessFloatMinusZero(UnsignedBits key)
   {
@@ -86,16 +85,15 @@ struct BaseDigitExtractor
 };
 
 template <typename KeyT>
-struct BaseDigitExtractor<KeyT, FLOATING_POINT>
+struct BaseDigitExtractor<KeyT, true>
 {
-  using TraitsT      = Traits<KeyT>;
-  using UnsignedBits = typename TraitsT::UnsignedBits;
+  using UnsignedBits = typename Twiddle<KeyT>::UnsignedBits;
 
   static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits ProcessFloatMinusZero(UnsignedBits key)
   {
     UnsignedBits TWIDDLED_MINUS_ZERO_BITS =
-      TraitsT::TwiddleIn(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1));
-    UnsignedBits TWIDDLED_ZERO_BITS = TraitsT::TwiddleIn(0);
+      Twiddle<KeyT>::In(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1));
+    UnsignedBits TWIDDLED_ZERO_BITS = Twiddle<KeyT>::In(0);
     return key == TWIDDLED_MINUS_ZERO_BITS ? TWIDDLED_ZERO_BITS : key;
   }
 };
@@ -232,23 +230,23 @@ using decomposer_check_t = is_tuple_of_references_to_fundamental_types_t<invoke_
 template <class T>
 struct bit_ordered_conversion_policy_t
 {
-  using bit_ordered_type = typename Traits<T>::UnsignedBits;
+  using bit_ordered_type = typename Twiddle<T>::UnsignedBits;
 
   static _CCCL_HOST_DEVICE bit_ordered_type to_bit_ordered(detail::identity_decomposer_t, bit_ordered_type val)
   {
-    return Traits<T>::TwiddleIn(val);
+    return Twiddle<T>::In(val);
   }
 
   static _CCCL_HOST_DEVICE bit_ordered_type from_bit_ordered(detail::identity_decomposer_t, bit_ordered_type val)
   {
-    return Traits<T>::TwiddleOut(val);
+    return Twiddle<T>::Out(val);
   }
 };
 
 template <class T>
 struct bit_ordered_inversion_policy_t
 {
-  using bit_ordered_type = typename Traits<T>::UnsignedBits;
+  using bit_ordered_type = typename Twiddle<T>::UnsignedBits;
 
   static _CCCL_HOST_DEVICE bit_ordered_type inverse(detail::identity_decomposer_t, bit_ordered_type val)
   {
@@ -259,7 +257,7 @@ struct bit_ordered_inversion_policy_t
 template <class T, bool = is_fundamental_type<T>::value>
 struct traits_t
 {
-  using bit_ordered_type              = typename Traits<T>::UnsignedBits;
+  using bit_ordered_type              = unsigned_bits_t<T>;
   using bit_ordered_conversion_policy = bit_ordered_conversion_policy_t<T>;
   using bit_ordered_inversion_policy  = bit_ordered_inversion_policy_t<T>;
 
diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh
index a5c3de4a313..fb13f9bd8c7 100644
--- a/cub/cub/device/device_reduce.cuh
+++ b/cub/cub/device/device_reduce.cuh
@@ -51,8 +51,9 @@
 
 #include <thrust/iterator/tabulate_output_iterator.h>
 
+#include <cuda/std/limits>
+
 #include <iterator>
-#include <limits>
 
 CUB_NAMESPACE_BEGIN
 
@@ -434,10 +435,7 @@ struct DeviceReduce
       d_out,
       static_cast<OffsetT>(num_items),
       ::cuda::minimum<>{},
-      // replace with
-      // std::numeric_limits<T>::max() when
-      // C++11 support is more prevalent
-      Traits<InitT>::Max(),
+      ::cuda::std::numeric_limits<InitT>::max(),
       stream);
   }
 
@@ -694,7 +692,7 @@ struct DeviceReduce
 
     // Initial value
     // TODO Address https://github.com/NVIDIA/cub/issues/651
-    InitT initial_value{AccumT(1, Traits<InputValueT>::Max())};
+    InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};
 
     return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin, InitT, AccumT>::Dispatch(
       d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream);
@@ -803,7 +801,7 @@ struct DeviceReduce
       // std::numeric_limits<T>::lowest()
       // when C++11 support is more
       // prevalent
-      Traits<InitT>::Lowest(),
+      ::cuda::std::numeric_limits<InitT>::lowest(),
       stream);
   }
 
@@ -1064,7 +1062,7 @@ struct DeviceReduce
 
     // Initial value
     // TODO Address https://github.com/NVIDIA/cub/issues/651
-    InitT initial_value{AccumT(1, Traits<InputValueT>::Lowest())};
+    InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};
 
     return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax, InitT, AccumT>::Dispatch(
       d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream);
diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh
index 5eac51ee742..69a8883a5b2 100644
--- a/cub/cub/device/device_segmented_reduce.cuh
+++ b/cub/cub/device/device_segmented_reduce.cuh
@@ -49,6 +49,7 @@
 #include <cub/iterator/arg_index_input_iterator.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/std/limits>
 #include <cuda/std/type_traits>
 
 #include <iterator>
@@ -508,10 +509,7 @@ public:
       d_begin_offsets,
       d_end_offsets,
       ::cuda::minimum<>{},
-      Traits<InputT>::Max(), // replace with
-                             // std::numeric_limits<T>::max()
-                             // when C++11 support is
-                             // more prevalent
+      ::cuda::std::numeric_limits<InputT>::max(),
       stream);
   }
 
@@ -639,7 +637,7 @@ public:
 
     // Initial value
     // TODO Address https://github.com/NVIDIA/cub/issues/651
-    InitT initial_value{AccumT(1, Traits<InputValueT>::Max())};
+    InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};
 
     using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
     static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
@@ -773,10 +771,7 @@ public:
       d_begin_offsets,
       d_end_offsets,
       ::cuda::maximum<>{},
-      Traits<InputT>::Lowest(), // replace with
-                                // std::numeric_limits<T>::lowest()
-                                // when C++11 support is
-                                // more prevalent
+      ::cuda::std::numeric_limits<InputT>::lowest(),
       stream);
   }
 
@@ -907,7 +902,7 @@ public:
 
     // Initial value
     // TODO Address https://github.com/NVIDIA/cub/issues/651
-    InitT initial_value{AccumT(1, Traits<InputValueT>::Lowest())};
+    InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};
 
     using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
     static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh
index 4d1db99a821..f90801b5b80 100644
--- a/cub/cub/util_type.cuh
+++ b/cub/cub/util_type.cuh
@@ -47,7 +47,7 @@
 
 #include <cuda/std/cstdint>
 #include <cuda/std/limits>
-#include <cuda/std/type_traits>
+#include <cuda/type_traits>
 
 #if defined(_CCCL_HAS_NVFP16)
 #  include <cuda_fp16.h>
@@ -59,6 +59,10 @@ _CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
 #  include <cuda_bf16.h>
 _CCCL_DIAG_POP
 
+#  if defined(_CCCL_HAS_NVFP8)
+#    include <cuda_fp8.h>
+#  endif // _CCCL_HAS_NVFP8
+
 #  if _CCCL_CUDACC_AT_LEAST(11, 8)
 // cuda_fp8.h resets default for C4127, so we have to guard the inclusion
 _CCCL_DIAG_PUSH
@@ -809,9 +813,8 @@ enum Category
   FLOATING_POINT
 };
 
-/**
- * \brief Basic type traits
- */
+namespace detail
+{
 template <Category _CATEGORY, bool _PRIMITIVE, bool _NULL_TYPE, typename _UnsignedBits, typename T>
 struct BaseTraits
 {
@@ -820,9 +823,6 @@ struct BaseTraits
   static constexpr bool NULL_TYPE    = _NULL_TYPE;
 };
 
-/**
- * Basic type traits (unsigned primitive specialization)
- */
 template <typename _UnsignedBits, typename T>
 struct BaseTraits<UNSIGNED_INTEGER, true, false, _UnsignedBits, T>
 {
@@ -861,9 +861,6 @@ struct BaseTraits<UNSIGNED_INTEGER, true, false, _UnsignedBits, T>
   }
 };
 
-/**
- * Basic type traits (signed primitive specialization)
- */
 template <typename _UnsignedBits, typename T>
 struct BaseTraits<SIGNED_INTEGER, true, false, _UnsignedBits, T>
 {
@@ -899,119 +896,20 @@ struct BaseTraits<SIGNED_INTEGER, true, false, _UnsignedBits, T>
   }
 };
 
-template <typename _T>
-struct FpLimits;
-
-template <>
-struct FpLimits<float>
-{
-  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE float Max()
-  {
-    return ::cuda::std::numeric_limits<float>::max();
-  }
-
-  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE float Lowest()
-  {
-    return ::cuda::std::numeric_limits<float>::lowest();
-  }
-};
-
-template <>
-struct FpLimits<double>
-{
-  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE double Max()
-  {
-    return ::cuda::std::numeric_limits<double>::max();
-  }
-
-  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE double Lowest()
-  {
-    return ::cuda::std::numeric_limits<double>::lowest();
-  }
-};
-
-#  if defined(_CCCL_HAS_NVFP16)
-template <>
-struct FpLimits<__half>
-{
-  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE __half Max()
-  {
-    unsigned short max_word = 0x7BFF;
-    return reinterpret_cast<__half&>(max_word);
-  }
-
-  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE __half Lowest()
-  {
-    unsigned short lowest_word = 0xFBFF;
-    return reinterpret_cast<__half&>(lowest_word);
-  }
-};
-#  endif // _CCCL_HAS_NVFP16
-
-#  if defined(_CCCL_HAS_NVBF16)
-template <>
-struct FpLimits<__nv_bfloat16>
-{
-  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE __nv_bfloat16 Max()
-  {
-    unsigned short max_word = 0x7F7F;
-    return reinterpret_cast<__nv_bfloat16&>(max_word);
-  }
-
-  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE __nv_bfloat16 Lowest()
-  {
-    unsigned short lowest_word = 0xFF7F;
-    return reinterpret_cast<__nv_bfloat16&>(lowest_word);
-  }
-};
-#  endif // _CCCL_HAS_NVBF16
-
-#  if defined(__CUDA_FP8_TYPES_EXIST__)
-template <>
-struct FpLimits<__nv_fp8_e4m3>
-{
-  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE __nv_fp8_e4m3 Max()
-  {
-    unsigned char max_word = 0x7EU;
-    __nv_fp8_e4m3 ret_val;
-    memcpy(&ret_val, &max_word, sizeof(__nv_fp8_e4m3));
-    return ret_val;
-  }
-
-  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE __nv_fp8_e4m3 Lowest()
-  {
-    unsigned char lowest_word = 0xFEU;
-    __nv_fp8_e4m3 ret_val;
-    memcpy(&ret_val, &lowest_word, sizeof(__nv_fp8_e4m3));
-    return ret_val;
-  }
-};
-
-template <>
-struct FpLimits<__nv_fp8_e5m2>
+template <typename T>
+struct FpLimits
 {
-  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE __nv_fp8_e5m2 Max()
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Max()
   {
-    unsigned char max_word = 0x7BU;
-    __nv_fp8_e5m2 ret_val;
-    memcpy(&ret_val, &max_word, sizeof(__nv_fp8_e5m2));
-    return ret_val;
+    return ::cuda::std::numeric_limits<T>::max();
   }
 
-  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE __nv_fp8_e5m2 Lowest()
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Lowest()
   {
-    unsigned char lowest_word = 0xFBU;
-    __nv_fp8_e5m2 ret_val;
-    memcpy(&ret_val, &lowest_word, sizeof(__nv_fp8_e5m2));
-    return ret_val;
+    return ::cuda::std::numeric_limits<T>::lowest();
   }
 };
 
-#  endif // __CUDA_FP8_TYPES_EXIST__
-
-/**
- * Basic type traits (fp primitive specialization)
- */
 template <typename _UnsignedBits, typename T>
 struct BaseTraits<FLOATING_POINT, true, false, _UnsignedBits, T>
 {
@@ -1047,9 +945,6 @@ struct BaseTraits<FLOATING_POINT, true, false, _UnsignedBits, T>
   }
 };
 
-/**
- * \brief Numeric type traits
- */
 // clang-format off
 template <typename T> struct NumericTraits :            BaseTraits<NOT_A_NUMBER, false, false, T, T> {};
 
@@ -1143,26 +1038,186 @@ struct NumericTraits<__int128_t>
 template <> struct NumericTraits<float> :               BaseTraits<FLOATING_POINT, true, false, unsigned int, float> {};
 template <> struct NumericTraits<double> :              BaseTraits<FLOATING_POINT, true, false, unsigned long long, double> {};
 #  if defined(_CCCL_HAS_NVFP16)
-    template <> struct NumericTraits<__half> :          BaseTraits<FLOATING_POINT, true, false, unsigned short, __half> {};
+template <> struct NumericTraits<__half> :          BaseTraits<FLOATING_POINT, true, false, unsigned short, __half> {};
 #  endif // _CCCL_HAS_NVFP16
 #  if defined(_CCCL_HAS_NVBF16)
-    template <> struct NumericTraits<__nv_bfloat16> :   BaseTraits<FLOATING_POINT, true, false, unsigned short, __nv_bfloat16> {};
+template <> struct NumericTraits<__nv_bfloat16> :   BaseTraits<FLOATING_POINT, true, false, unsigned short, __nv_bfloat16> {};
 #  endif // _CCCL_HAS_NVBF16
 
 #if defined(__CUDA_FP8_TYPES_EXIST__)
-    template <> struct NumericTraits<__nv_fp8_e4m3> :   BaseTraits<FLOATING_POINT, true, false, __nv_fp8_storage_t, __nv_fp8_e4m3> {};
-    template <> struct NumericTraits<__nv_fp8_e5m2> :   BaseTraits<FLOATING_POINT, true, false, __nv_fp8_storage_t, __nv_fp8_e5m2> {};
+template <> struct NumericTraits<__nv_fp8_e4m3> :   BaseTraits<FLOATING_POINT, true, false, __nv_fp8_storage_t, __nv_fp8_e4m3> {};
+template <> struct NumericTraits<__nv_fp8_e5m2> :   BaseTraits<FLOATING_POINT, true, false, __nv_fp8_storage_t, __nv_fp8_e5m2> {};
 #endif // __CUDA_FP8_TYPES_EXIST__
 
 template <> struct NumericTraits<bool> :                BaseTraits<UNSIGNED_INTEGER, true, false, typename UnitWord<bool>::VolatileWord, bool> {};
 // clang-format on
 
-/**
- * \brief Type traits
- */
 template <typename T>
 struct Traits : NumericTraits<typename ::cuda::std::remove_cv<T>::type>
 {};
+} // namespace detail
+
+template <Category _CATEGORY, bool _PRIMITIVE, bool _NULL_TYPE, typename _UnsignedBits, typename T>
+using BaseTraits CCCL_DEPRECATED = detail::BaseTraits<_CATEGORY, _PRIMITIVE, _NULL_TYPE, _UnsignedBits, T>;
+
+template <typename T>
+using FpLimits CCCL_DEPRECATED_BECAUSE("Use cuda::std::numeric_limits instead") = detail::FpLimits<T>;
+
+template <typename T>
+using NumericTraits CCCL_DEPRECATED = detail::NumericTraits<T>;
+
+template <typename T>
+using Traits CCCL_DEPRECATED = detail::Traits<T>;
+
+namespace detail
+{
+//! Trait to get an unsigned integral type with the same size as T, exposed as a nested alias ::type.
+template <typename T, typename SFINAE = void>
+struct unsigned_bits;
+
+template <typename T>
+struct unsigned_bits<T, ::cuda::std::enable_if_t<::cuda::std::is_unsigned_v<T>>>
+{
+  using type = T;
+};
+
+template <typename T>
+struct unsigned_bits<T, ::cuda::std::enable_if_t<::cuda::std::is_signed_v<T>>>
+{
+  using type = ::cuda::std::make_unsigned_t<T>;
+};
+
+template <>
+struct unsigned_bits<float, void>
+{
+  using type = unsigned int;
+};
+
+template <>
+struct unsigned_bits<double, void>
+{
+  using type = unsigned long long;
+};
+
+#  if defined(_CCCL_HAS_NVFP16)
+template <>
+struct unsigned_bits<__half, void>
+{
+  using type = unsigned short;
+};
+#  endif // _CCCL_HAS_NVFP16
+
+#  if defined(_CCCL_HAS_NVBF16)
+template <>
+struct unsigned_bits<__nv_bfloat16, void>
+{
+  using type = unsigned short;
+};
+#  endif // _CCCL_HAS_NVBF16
+
+#  if defined(__CUDA_FP8_TYPES_EXIST__)
+template <>
+struct unsigned_bits<__nv_fp8_e4m3, void>
+{
+  using type = __nv_fp8_storage_t;
+};
+template <>
+struct unsigned_bits<__nv_fp8_e5m2, void>
+{
+  using type = __nv_fp8_storage_t;
+};
+#  endif // __CUDA_FP8_TYPES_EXIST__
+
+template <>
+struct unsigned_bits<bool, void>
+{
+  using type = typename UnitWord<bool>::VolatileWord;
+};
+
+//! Alias to an unsigned integral type with the same size as T.
+template <typename T>
+using unsigned_bits_t = typename unsigned_bits<T>::type;
+} // namespace detail
+
+//! Bit twiddling utilities
+template <typename T, typename SFINAE = void>
+struct Twiddle;
+
+template <typename T>
+struct Twiddle<
+  T,
+  ::cuda::std::enable_if_t<::cuda::std::__cccl_is_unsigned_integer<T>::value
+                           || (::cuda::std::is_same_v<T, char> && !::cuda::std::numeric_limits<char>::is_signed)>>
+{
+  using UnsignedBits = T;
+
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits In(UnsignedBits key)
+  {
+    return key;
+  }
+
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits Out(UnsignedBits key)
+  {
+    return key;
+  }
+};
+
+template <typename T>
+struct Twiddle<
+  T,
+  ::cuda::std::enable_if_t<::cuda::std::__cccl_is_signed_integer<T>::value
+                           || (::cuda::std::is_same_v<T, char> && ::cuda::std::numeric_limits<char>::is_signed)>>
+{
+  using UnsignedBits = detail::unsigned_bits_t<T>;
+
+  static constexpr UnsignedBits high_bit = UnsignedBits(1) << (sizeof(UnsignedBits) * CHAR_BIT - 1);
+
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits In(UnsignedBits key)
+  {
+    return key ^ high_bit;
+  }
+
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits Out(UnsignedBits key)
+  {
+    return key ^ high_bit;
+  }
+};
+
+template <typename T>
+struct Twiddle<T, ::cuda::std::enable_if_t<::cuda::is_floating_point_v<T>>>
+{
+  using UnsignedBits = detail::unsigned_bits_t<T>; // unsigned integer with the same size as T
+
+  static constexpr UnsignedBits high_bit = UnsignedBits(1) << (sizeof(UnsignedBits) * CHAR_BIT - 1); // top bit
+
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits In(UnsignedBits key)
+  {
+    const UnsignedBits mask = (key & high_bit) ? UnsignedBits(-1) : high_bit; // set: flip all bits, else top bit
+    return key ^ mask;
+  }
+
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits Out(UnsignedBits key)
+  {
+    const UnsignedBits mask = (key & high_bit) ? high_bit : UnsignedBits(-1); // inverse of the mask used by In()
+    return key ^ mask;
+  }
+};
+
+template <>
+struct Twiddle<bool, void>
+{
+  using UnsignedBits = detail::unsigned_bits_t<bool>;
+
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits In(UnsignedBits key)
+  {
+    return key;
+  }
+
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits Out(UnsignedBits key)
+  {
+    return key;
+  }
+};
 
 #endif // _CCCL_DOXYGEN_INVOKED
 
diff --git a/cub/test/catch2_radix_sort_helper.cuh b/cub/test/catch2_radix_sort_helper.cuh
index 642b2aed4f1..3428bc058ba 100644
--- a/cub/test/catch2_radix_sort_helper.cuh
+++ b/cub/test/catch2_radix_sort_helper.cuh
@@ -217,8 +217,7 @@ c2h::host_vector<KeyT> get_striped_keys(const c2h::host_vector<KeyT>& h_keys, in
   c2h::host_vector<KeyT> h_striped_keys(h_keys);
   KeyT* h_striped_keys_data = thrust::raw_pointer_cast(h_striped_keys.data());
 
-  using traits_t      = cub::Traits<KeyT>;
-  using bit_ordered_t = typename traits_t::UnsignedBits;
+  using bit_ordered_t = typename cub::Twiddle<KeyT>::UnsignedBits;
 
   const int num_bits = end_bit - begin_bit;
 
@@ -226,7 +225,7 @@ c2h::host_vector<KeyT> get_striped_keys(const c2h::host_vector<KeyT>& h_keys, in
   {
     bit_ordered_t key = ::cuda::std::bit_cast<bit_ordered_t>(h_keys[i]);
 
-    _CCCL_IF_CONSTEXPR (traits_t::CATEGORY == cub::FLOATING_POINT)
+    _CCCL_IF_CONSTEXPR (::cuda::is_floating_point_v<KeyT>)
     {
       const bit_ordered_t negative_zero = bit_ordered_t(1) << bit_ordered_t(sizeof(bit_ordered_t) * 8 - 1);
 
@@ -236,7 +235,7 @@ c2h::host_vector<KeyT> get_striped_keys(const c2h::host_vector<KeyT>& h_keys, in
       }
     }
 
-    key = traits_t::TwiddleIn(key);
+    key = cub::Twiddle<KeyT>::In(key);
 
     if ((begin_bit > 0) || (end_bit < static_cast<int>(sizeof(KeyT) * 8)))
     {
@@ -290,8 +289,7 @@ c2h::host_vector<std::size_t> get_permutation(
   c2h::host_vector<std::size_t> h_permutation(h_keys.size());
   thrust::sequence(h_permutation.begin(), h_permutation.end());
 
-  using traits_t      = cub::Traits<KeyT>;
-  using bit_ordered_t = typename traits_t::UnsignedBits;
+  using bit_ordered_t = typename cub::Twiddle<KeyT>::UnsignedBits;
 
   auto bit_ordered_striped_keys =
     reinterpret_cast<const bit_ordered_t*>(thrust::raw_pointer_cast(h_striped_keys.data()));
diff --git a/cub/test/catch2_test_block_radix_sort.cu b/cub/test/catch2_test_block_radix_sort.cu
index 403451d6749..77f234dc13b 100644
--- a/cub/test/catch2_test_block_radix_sort.cu
+++ b/cub/test/catch2_test_block_radix_sort.cu
@@ -104,8 +104,7 @@ bool binary_equal(
 {
   d_tmp = h_reference;
 
-  using traits_t      = cub::Traits<T>;
-  using bit_ordered_t = typename traits_t::UnsignedBits;
+  using bit_ordered_t = typename cub::Twiddle<T>::UnsignedBits;
 
   auto d_output_ptr    = reinterpret_cast<const bit_ordered_t*>(thrust::raw_pointer_cast(d_output.data()));
   auto d_reference_ptr = reinterpret_cast<const bit_ordered_t*>(thrust::raw_pointer_cast(d_tmp.data()));
diff --git a/cub/test/catch2_test_device_histogram.cu b/cub/test/catch2_test_device_histogram.cu
index 8a4186a406b..48bc63b168a 100644
--- a/cub/test/catch2_test_device_histogram.cu
+++ b/cub/test/catch2_test_device_histogram.cu
@@ -28,17 +28,19 @@
 
 #include <cub/device/device_histogram.cuh>
 #include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/util_type.cuh>
 
 #include <cuda/std/__algorithm_>
 #include <cuda/std/array>
 #include <cuda/std/bit>
-#include <cuda/std/type_traits>
+#include <cuda/type_traits>
 
 #include <algorithm>
 #include <limits>
 #include <tuple>
 
 #include "catch2_test_launch_helper.h"
+#include "cub/util_type.cuh"
 #include <c2h/catch2_test_helper.h>
 #include <c2h/extended_types.h>
 #include <c2h/vector.h>
@@ -211,7 +213,7 @@ struct bit_and_anything
   template <typename T>
   _CCCL_HOST_DEVICE auto operator()(const T& a, const T& b) const -> T
   {
-    using U = typename cub::Traits<T>::UnsignedBits;
+    using U = typename cub::detail::unsigned_bits_t<T>;
     return ::cuda::std::bit_cast<T>(static_cast<U>(::cuda::std::bit_cast<U>(a) & ::cuda::std::bit_cast<U>(b)));
   }
 };
@@ -420,8 +422,7 @@ using types =
 C2H_TEST("DeviceHistogram::Histogram* basic use", "[histogram][device]", types)
 {
   using sample_t = c2h::get<0, TestType>;
-  using level_t =
-    typename cs::conditional<cub::NumericTraits<sample_t>::CATEGORY == cub::FLOATING_POINT, sample_t, int>::type;
+  using level_t  = typename cs::conditional<cuda::is_floating_point_v<sample_t>, sample_t, int>::type;
   // Max for int8/uint8 is 2^8, for half_t is 2^10. Beyond, we would need a different level generation
   const auto max_level       = level_t{sizeof(sample_t) == 1 ? 126 : 1024};
   const auto max_level_count = (sizeof(sample_t) == 1 ? 126 : 1024) + 1;
@@ -435,7 +436,7 @@ C2H_TEST("DeviceHistogram::Histogram* large levels", "[histogram][device]", c2h:
   using sample_t             = c2h::get<0, TestType>;
   using level_t              = sample_t;
   const auto max_level_count = 128;
-  auto max_level             = cub::NumericTraits<level_t>::Max();
+  auto max_level             = cuda::std::numeric_limits<level_t>::max();
   _CCCL_IF_CONSTEXPR (sizeof(sample_t) > sizeof(int))
   {
     max_level /= static_cast<level_t>(max_level_count - 1); // cf. overflow detection in ScaleTransform::MayOverflow
diff --git a/cub/test/catch2_test_device_radix_sort_keys.cu b/cub/test/catch2_test_device_radix_sort_keys.cu
index d09003f7d74..296f5776f56 100644
--- a/cub/test/catch2_test_device_radix_sort_keys.cu
+++ b/cub/test/catch2_test_device_radix_sort_keys.cu
@@ -189,7 +189,7 @@ C2H_TEST("DeviceRadixSort::SortKeys: bit windows", "[keys][radix][sort][device]"
 C2H_TEST("DeviceRadixSort::SortKeys: negative zero handling", "[keys][radix][sort][device]", fp_key_types)
 {
   using key_t  = c2h::get<0, TestType>;
-  using bits_t = typename cub::Traits<key_t>::UnsignedBits;
+  using bits_t = typename cub::Twiddle<key_t>::UnsignedBits;
 
   constexpr std::size_t num_bits = sizeof(key_t) * CHAR_BIT;
   const key_t positive_zero      = ::cuda::std::bit_cast<key_t>(bits_t(0));
diff --git a/cub/test/catch2_test_device_reduce.cuh b/cub/test/catch2_test_device_reduce.cuh
index 6e89b692ed0..620dc93ae05 100644
--- a/cub/test/catch2_test_device_reduce.cuh
+++ b/cub/test/catch2_test_device_reduce.cuh
@@ -109,36 +109,6 @@ __host__ __device__ __forceinline__ //
 }
 #endif // TEST_HALF_T
 
-/**
- * @brief Introduces the required NumericTraits for `c2h::custom_type_t`.
- */
-template <template <typename> class... Policies>
-struct NumericTraits<c2h::custom_type_t<Policies...>>
-{
-  using custom_t                     = c2h::custom_type_t<Policies...>;
-  static constexpr Category CATEGORY = NOT_A_NUMBER;
-  enum
-  {
-    PRIMITIVE = false,
-    NULL_TYPE = false,
-  };
-  __host__ __device__ static custom_t Max()
-  {
-    custom_t val{};
-    val.key = NumericTraits<decltype(std::declval<custom_t>().key)>::Max();
-    val.val = NumericTraits<decltype(std::declval<custom_t>().val)>::Max();
-    return val;
-  }
-
-  __host__ __device__ static custom_t Lowest()
-  {
-    custom_t val{};
-    val.key = NumericTraits<decltype(std::declval<custom_t>().key)>::Lowest();
-    val.val = NumericTraits<decltype(std::declval<custom_t>().val)>::Lowest();
-    return val;
-  }
-};
-
 template <typename Key, typename Value>
 static std::ostream& operator<<(std::ostream& os, const KeyValuePair<Key, Value>& val)
 {
@@ -400,7 +370,7 @@ void compute_segmented_argmin_reference(
   {
     if (h_offsets[seg] >= h_offsets[seg + 1])
     {
-      h_results[seg] = {1, cub::Traits<ItemT>::Max()};
+      h_results[seg] = {1, ::cuda::std::numeric_limits<ItemT>::max()};
     }
     else
     {
@@ -427,7 +397,7 @@ void compute_segmented_argmax_reference(
   {
     if (h_offsets[seg] >= h_offsets[seg + 1])
     {
-      h_results[seg] = {1, cub::Traits<ItemT>::Lowest()};
+      h_results[seg] = {1, ::cuda::std::numeric_limits<ItemT>::lowest()};
     }
     else
     {
diff --git a/cub/test/catch2_test_device_reduce_by_key.cu b/cub/test/catch2_test_device_reduce_by_key.cu
index ee8726219f2..82b532af75d 100644
--- a/cub/test/catch2_test_device_reduce_by_key.cu
+++ b/cub/test/catch2_test_device_reduce_by_key.cu
@@ -149,7 +149,7 @@ C2H_TEST("Device reduce-by-key works", "[by_key][reduce][device]", full_type_lis
     // Prepare verification data
     c2h::host_vector<output_t> expected_result(num_segments);
     compute_segmented_problem_reference(
-      in_values, segment_offsets, op_t{}, cub::NumericTraits<value_t>::Max(), expected_result.begin());
+      in_values, segment_offsets, op_t{}, cuda::std::numeric_limits<value_t>::max(), expected_result.begin());
     c2h::host_vector<key_t> expected_keys = compute_unique_keys_reference(segment_keys);
 
     // Run test
diff --git a/cub/test/catch2_test_device_scan.cu b/cub/test/catch2_test_device_scan.cu
index d9cf517f55d..d7d2322cd56 100644
--- a/cub/test/catch2_test_device_scan.cu
+++ b/cub/test/catch2_test_device_scan.cu
@@ -190,7 +190,11 @@ C2H_TEST("Device scan works with all device interfaces", "[scan][device]", full_
     c2h::host_vector<input_t> host_items(in_items);
     c2h::host_vector<output_t> expected_result(num_items);
     compute_inclusive_scan_reference(
-      host_items.cbegin(), host_items.cend(), expected_result.begin(), op_t{}, cub::NumericTraits<accum_t>::Max());
+      host_items.cbegin(),
+      host_items.cend(),
+      expected_result.begin(),
+      op_t{},
+      cuda::std::numeric_limits<accum_t>::max());
 
     // Run test
     c2h::device_vector<output_t> out_result(num_items);
diff --git a/cub/test/catch2_test_device_scan_iterators.cu b/cub/test/catch2_test_device_scan_iterators.cu
index a6a2a2941bd..32ca7dff183 100644
--- a/cub/test/catch2_test_device_scan_iterators.cu
+++ b/cub/test/catch2_test_device_scan_iterators.cu
@@ -33,6 +33,8 @@
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
 
+#include <cuda/std/limits>
+
 #include <cstdint>
 
 #include "catch2_test_device_reduce.cuh"
@@ -125,7 +127,7 @@ C2H_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis
     // Prepare verification data
     c2h::host_vector<output_t> expected_result(num_items);
     compute_inclusive_scan_reference(
-      in_it, in_it + num_items, expected_result.begin(), op_t{}, cub::NumericTraits<accum_t>::Max());
+      in_it, in_it + num_items, expected_result.begin(), op_t{}, cuda::std::numeric_limits<accum_t>::max());
 
     // Run test
     c2h::device_vector<output_t> out_result(num_items);
diff --git a/cub/test/catch2_test_device_segmented_reduce.cu b/cub/test/catch2_test_device_segmented_reduce.cu
index c524a7ef753..864d6c640f3 100644
--- a/cub/test/catch2_test_device_segmented_reduce.cu
+++ b/cub/test/catch2_test_device_segmented_reduce.cu
@@ -165,7 +165,7 @@ C2H_TEST("Device reduce works with all device interfaces", "[segmented][reduce][
     // Prepare verification data
     c2h::host_vector<output_t> expected_result(num_segments);
     compute_segmented_problem_reference(
-      in_items, segment_offsets, op_t{}, cub::NumericTraits<input_t>::Max(), expected_result.begin());
+      in_items, segment_offsets, op_t{}, cuda::std::numeric_limits<input_t>::max(), expected_result.begin());
 
     // Run test
     c2h::device_vector<output_t> out_result(num_segments);
@@ -183,7 +183,7 @@ C2H_TEST("Device reduce works with all device interfaces", "[segmented][reduce][
     // Prepare verification data
     c2h::host_vector<output_t> expected_result(num_segments);
     compute_segmented_problem_reference(
-      in_items, segment_offsets, op_t{}, cub::NumericTraits<input_t>::Lowest(), expected_result.begin());
+      in_items, segment_offsets, op_t{}, cuda::std::numeric_limits<input_t>::lowest(), expected_result.begin());
 
     // Run test
     c2h::device_vector<output_t> out_result(num_segments);
diff --git a/cub/test/test_util.h b/cub/test/test_util.h
index c06d803ecb1..72c6f2bc850 100644
--- a/cub/test/test_util.h
+++ b/cub/test/test_util.h
@@ -28,6 +28,7 @@
 
 #pragma once
 
+#include "cuda/std/__internal/namespaces.h"
 #ifdef _WIN32
 #  include <windows.h>
 #  undef small // Windows is terrible for polluting macro namespace
@@ -580,7 +581,7 @@ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T& value, s
       case RANDOM_MINUS_PLUS_ZERO: {
         // Replace roughly 1/128 of values with -0.0 or +0.0, and
         // generate the rest randomly
-        using UnsignedBits = typename CUB_NS_QUALIFIER::Traits<T>::UnsignedBits;
+        using UnsignedBits = CUB_NS_QUALIFIER::detail::unsigned_bits_t<T>;
         char c;
         RandomBits(c);
         if (c == 0)
@@ -966,36 +967,30 @@ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestFoo& va
   InitValue(gen_mode, value.w, index);
 }
 
-/// numeric_limits<TestFoo> specialization
-CUB_NAMESPACE_BEGIN
+_LIBCUDACXX_BEGIN_NAMESPACE_STD
 template <>
-struct NumericTraits<TestFoo>
+class numeric_limits<TestFoo>
 {
-  static constexpr Category CATEGORY = NOT_A_NUMBER;
-  enum
-  {
-    PRIMITIVE = false,
-    NULL_TYPE = false,
-  };
-  __host__ __device__ static TestFoo Max()
+public: // max()/lowest() are called from outside the class, so they must not be private
+  __host__ __device__ static TestFoo max()
   {
     return TestFoo::MakeTestFoo(
-      NumericTraits<long long>::Max(),
-      NumericTraits<int>::Max(),
-      NumericTraits<short>::Max(),
-      NumericTraits<char>::Max());
+      numeric_limits<long long>::max(),
+      numeric_limits<int>::max(),
+      numeric_limits<short>::max(),
+      numeric_limits<char>::max());
   }
 
-  __host__ __device__ static TestFoo Lowest()
+  __host__ __device__ static TestFoo lowest()
   {
     return TestFoo::MakeTestFoo(
-      NumericTraits<long long>::Lowest(),
-      NumericTraits<int>::Lowest(),
-      NumericTraits<short>::Lowest(),
-      NumericTraits<char>::Lowest());
+      numeric_limits<long long>::lowest(),
+      numeric_limits<int>::lowest(),
+      numeric_limits<short>::lowest(),
+      numeric_limits<char>::lowest());
   }
 };
-CUB_NAMESPACE_END
+_LIBCUDACXX_END_NAMESPACE_STD
 
 //---------------------------------------------------------------------
 // Complex data type TestBar (with optimizations for fence-free warp-synchrony)
@@ -1100,28 +1095,22 @@ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestBar& va
   InitValue(gen_mode, value.y, index);
 }
 
-/// numeric_limits<TestBar> specialization
-CUB_NAMESPACE_BEGIN
+_LIBCUDACXX_BEGIN_NAMESPACE_STD
 template <>
-struct NumericTraits<TestBar>
+class numeric_limits<TestBar>
 {
-  static constexpr Category CATEGORY = NOT_A_NUMBER;
-  enum
-  {
-    PRIMITIVE = false,
-    NULL_TYPE = false,
-  };
-  __host__ __device__ static TestBar Max()
+public: // `class` defaults to private access; max()/lowest() must be reachable by callers
+  __host__ __device__ static TestBar max()
   {
-    return TestBar(NumericTraits<long long>::Max(), NumericTraits<int>::Max());
+    return TestBar(numeric_limits<long long>::max(), numeric_limits<int>::max());
   }
 
-  __host__ __device__ static TestBar Lowest()
+  __host__ __device__ static TestBar lowest()
   {
-    return TestBar(NumericTraits<long long>::Lowest(), NumericTraits<int>::Lowest());
+    return TestBar(numeric_limits<long long>::lowest(), numeric_limits<int>::lowest());
   }
 };
-CUB_NAMESPACE_END
+_LIBCUDACXX_END_NAMESPACE_STD
 
 /******************************************************************************
  * Helper routines for list comparison and display
diff --git a/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h b/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h
index bb1afa4225b..f1e468039c3 100644
--- a/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h
+++ b/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h
@@ -33,6 +33,10 @@ _CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
 _CCCL_DIAG_POP
 #endif // _LIBCUDACXX_HAS_NVBF16
 
+#if defined(_CCCL_HAS_NVFP8)
+#  include <cuda_fp8.h>
+#endif // _CCCL_HAS_NVFP8
+
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
 
 template <class _Tp>
@@ -71,6 +75,22 @@ _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<__nv_bfloat16> =
 #  endif // !_CCCL_NO_INLINE_VARIABLES
 #endif // _LIBCUDACXX_HAS_NVBF16
 
+#if defined(_CCCL_HAS_NVFP8)
+template <>
+struct __is_extended_floating_point<__nv_fp8_e4m3> : true_type
+{};
+template <>
+struct __is_extended_floating_point<__nv_fp8_e5m2> : true_type
+{};
+
+#  ifndef _CCCL_NO_INLINE_VARIABLES
+template <>
+_CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<__nv_fp8_e4m3> = true;
+template <>
+_CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<__nv_fp8_e5m2> = true;
+#  endif // !_CCCL_NO_INLINE_VARIABLES
+#endif // _CCCL_HAS_NVFP8
+
 _LIBCUDACXX_END_NAMESPACE_STD
 
 #endif // _LIBCUDACXX___TYPE_TRAITS_IS_EXTENDED_FLOATING_POINT_H
diff --git a/libcudacxx/test/libcudacxx/libcxx/utilities/meta/meta.unary/meta.unary.cat/is_floating_point.pass.cpp b/libcudacxx/test/libcudacxx/libcxx/utilities/meta/meta.unary/meta.unary.cat/is_floating_point.pass.cpp
index 13bb443314a..6ff0ea388e7 100644
--- a/libcudacxx/test/libcudacxx/libcxx/utilities/meta/meta.unary/meta.unary.cat/is_floating_point.pass.cpp
+++ b/libcudacxx/test/libcudacxx/libcxx/utilities/meta/meta.unary/meta.unary.cat/is_floating_point.pass.cpp
@@ -86,6 +86,10 @@ int main(int, char**)
 #ifdef _LIBCUDACXX_HAS_NVBF16
   test_is_floating_point<__nv_bfloat16>();
 #endif // _LIBCUDACXX_HAS_NVBF16
+#ifdef _CCCL_HAS_NVFP8
+  test_is_floating_point<__nv_fp8_e4m3>();
+  test_is_floating_point<__nv_fp8_e5m2>();
+#endif // _CCCL_HAS_NVFP8
 
   test_is_not_floating_point<short>();
   test_is_not_floating_point<unsigned short>();