diff --git a/libc/src/__support/CPP/atomic.h b/libc/src/__support/CPP/atomic.h index 6922a367289a9f..495d492a0b7c61 100644 --- a/libc/src/__support/CPP/atomic.h +++ b/libc/src/__support/CPP/atomic.h @@ -90,10 +90,6 @@ template struct Atomic { return __atomic_fetch_or(&val, mask, int(mem_ord)); } - T fetch_and(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) { - return __atomic_fetch_and(&val, mask, int(mem_ord)); - } - T fetch_sub(T decrement, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) { return __atomic_fetch_sub(&val, decrement, int(mem_ord)); } diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h index 87cd6451445a3e..ca9122b6b6a54b 100644 --- a/libc/src/__support/GPU/amdgpu/utils.h +++ b/libc/src/__support/GPU/amdgpu/utils.h @@ -106,7 +106,7 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; } /// Returns the id of the thread inside of an AMD wavefront executing together. [[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() { - if constexpr (LANE_SIZE == 64) + if (LANE_SIZE == 64) return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); else return __builtin_amdgcn_mbcnt_lo(~0u, 0u); @@ -122,16 +122,6 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; } return __builtin_amdgcn_readfirstlane(x); } -[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) { - // the lane_mask & gives the nvptx semantics when lane_mask is a subset of - // the active threads - if constexpr (LANE_SIZE == 64) { - return lane_mask & __builtin_amdgcn_ballot_w64(x); - } else { - return lane_mask & __builtin_amdgcn_ballot_w32(x); - } -} - /// Waits for all the threads in the block to converge and issues a fence. [[clang::convergent]] LIBC_INLINE void sync_threads() { __builtin_amdgcn_s_barrier(); diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h index 546e83e033afab..0decb3fa59d59b 100644 --- a/libc/src/__support/GPU/generic/utils.h +++ b/libc/src/__support/GPU/generic/utils.h @@ -58,11 +58,6 @@ LIBC_INLINE uint64_t get_lane_mask() { return 1; } LIBC_INLINE uint32_t broadcast_value(uint32_t x) { return x; } -LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) { - (void)lane_mask; - return x; -} - LIBC_INLINE void sync_threads() {} LIBC_INLINE void sync_lane(uint64_t) {} diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h index 9f20edf16c6210..443b8c72fc85c8 100644 --- a/libc/src/__support/GPU/nvptx/utils.h +++ b/libc/src/__support/GPU/nvptx/utils.h @@ -118,13 +118,6 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; } #endif } -[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) { -#if __CUDA_ARCH__ >= 600 - return __nvvm_vote_ballot_sync(lane_mask, x); -#else - return lane_mask & __nvvm_vote_ballot(x); -#endif -} /// Waits for all the threads in the block to converge and issues a fence. [[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); } diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h index fc7a66f4b88d56..5f6c149b84baa0 100644 --- a/libc/src/__support/RPC/rpc.h +++ b/libc/src/__support/RPC/rpc.h @@ -107,55 +107,16 @@ template struct Process { } /// Attempt to claim the lock at index. Return true on lock taken. - /// lane_mask is a bitmap of the threads in the warp that would hold the - /// single lock on success, e.g. the result of gpu::get_lane_mask() /// The lock is held when the zeroth bit of the uint32_t at lock[index] /// is set, and available when that bit is clear. Bits [1, 32) are zero. /// Or with one is a no-op when the lock is already held. - [[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask, - uint64_t index) { - // On amdgpu, test and set to lock[index] and a sync_lane would suffice - // On volta, need to handle differences between the threads running and - // the threads that were detected in the previous call to get_lane_mask() - // - // All threads in lane_mask try to claim the lock. At most one can succeed. - // There may be threads active which are not in lane mask which must not - // succeed in taking the lock, as otherwise it will leak. This is handled - // by making threads which are not in lane_mask or with 0, a no-op. - uint32_t id = gpu::get_lane_id(); - bool id_in_lane_mask = lane_mask & (1ul << id); - - // All threads in the warp call fetch_or. Possibly at the same time. - bool before = - lock[index].fetch_or(id_in_lane_mask, cpp::MemoryOrder::RELAXED); - uint64_t packed = gpu::ballot(lane_mask, before); - - // If every bit set in lane_mask is also set in packed, every single thread - // in the warp failed to get the lock. Ballot returns unset for threads not - // in the lane mask. - // - // Cases, per thread: - // mask==0 -> unspecified before, discarded by ballot -> 0 - // mask==1 and before==0 (success), set zero by ballot -> 0 - // mask==1 and before==1 (failure), set one by ballot -> 1 - // - // mask != packed implies at least one of the threads got the lock - // atomic semantics of fetch_or mean at most one of the threads for the lock - return lane_mask != packed; + LIBC_INLINE bool try_lock(uint64_t, uint64_t index) { + return lock[index].fetch_or(1, cpp::MemoryOrder::RELAXED) == 0; } // Unlock the lock at index. - [[clang::convergent]] LIBC_INLINE void unlock(uint64_t lane_mask, - uint64_t index) { - // Wait for other threads in the warp to finish using the lock - gpu::sync_lane(lane_mask); - - // Use exactly one thread to clear the bit at position 0 in lock[index] - // Must restrict to a single thread to avoid one thread dropping the lock, - // then an unrelated warp claiming the lock, then a second thread in this - // warp dropping the lock again. - uint32_t and_mask = ~(rpc::is_first_lane(lane_mask) ? 1 : 0); - lock[index].fetch_and(and_mask, cpp::MemoryOrder::RELAXED); + LIBC_INLINE void unlock(uint64_t, uint64_t index) { + lock[index].store(0, cpp::MemoryOrder::RELAXED); } }; diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h index da305e3f4b662f..53a993292fb35f 100644 --- a/libc/src/__support/RPC/rpc_util.h +++ b/libc/src/__support/RPC/rpc_util.h @@ -9,7 +9,6 @@ #ifndef LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H #define LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H -#include "src/__support/GPU/utils.h" #include "src/__support/macros/attributes.h" #include "src/__support/macros/properties/architectures.h"