Skip to content

Commit

Permalink
Revert "[libc][rpc] Update locking to work on volta"
Browse files Browse the repository at this point in the history
This reverts commit b132373.
  • Loading branch information
JonChesterfield committed May 4, 2023
1 parent b132373 commit 8aaaa1c
Show file tree
Hide file tree
Showing 6 changed files with 5 additions and 71 deletions.
4 changes: 0 additions & 4 deletions libc/src/__support/CPP/atomic.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,6 @@ template <typename T> struct Atomic {
return __atomic_fetch_or(&val, mask, int(mem_ord));
}

/// Atomically ANDs `mask` into the stored value through the
/// __atomic_fetch_and builtin and forwards that builtin's result.
/// The memory order defaults to SEQ_CST.
T fetch_and(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) {
  const int order = int(mem_ord);
  return __atomic_fetch_and(&val, mask, order);
}

/// Atomically subtracts `decrement` from the stored value through the
/// __atomic_fetch_sub builtin and forwards that builtin's result.
/// The memory order defaults to SEQ_CST.
T fetch_sub(T decrement, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) {
  const int order = int(mem_ord);
  return __atomic_fetch_sub(&val, decrement, order);
}
Expand Down
12 changes: 1 addition & 11 deletions libc/src/__support/GPU/amdgpu/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }

/// Returns the id of the thread inside of an AMD wavefront executing together.
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
if constexpr (LANE_SIZE == 64)
if (LANE_SIZE == 64)
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
else
return __builtin_amdgcn_mbcnt_lo(~0u, 0u);
Expand All @@ -122,16 +122,6 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
return __builtin_amdgcn_readfirstlane(x);
}

/// Runs the wavefront ballot builtin matching LANE_SIZE on `x` and keeps only
/// the bits that are also set in `lane_mask`.
[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
  uint64_t votes;
  if constexpr (LANE_SIZE == 64)
    votes = __builtin_amdgcn_ballot_w64(x);
  else
    votes = __builtin_amdgcn_ballot_w32(x);
  // Masking with lane_mask gives the nvptx semantics when lane_mask is a
  // subset of the active threads.
  return lane_mask & votes;
}

/// Waits for all the threads in the block to converge and issues a fence.
[[clang::convergent]] LIBC_INLINE void sync_threads() {
__builtin_amdgcn_s_barrier();
Expand Down
5 changes: 0 additions & 5 deletions libc/src/__support/GPU/generic/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,6 @@ LIBC_INLINE uint64_t get_lane_mask() { return 1; }

/// Generic fallback: hands `x` back unchanged.
LIBC_INLINE uint32_t broadcast_value(uint32_t x) { return x; }

/// Generic fallback: `lane_mask` is ignored and the result is `x` converted
/// to uint64_t (0 or 1).
LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
  static_cast<void>(lane_mask); // deliberately unused
  return x;
}

/// Generic fallback: empty body, performs no work.
LIBC_INLINE void sync_threads() {}

/// Generic fallback: the mask argument is unnamed and unused; empty body.
LIBC_INLINE void sync_lane(uint64_t) {}
Expand Down
7 changes: 0 additions & 7 deletions libc/src/__support/GPU/nvptx/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,13 +118,6 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
#endif
}

/// Ballot of the predicate `x`, restricted to `lane_mask`. How the mask is
/// applied depends on __CUDA_ARCH__ (see the two branches below).
[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
#if __CUDA_ARCH__ >= 600
// The sync variant is handed lane_mask directly.
return __nvvm_vote_ballot_sync(lane_mask, x);
#else
// The non-sync vote takes no mask, so its result is filtered with lane_mask.
return lane_mask & __nvvm_vote_ballot(x);
#endif
}
/// Block-wide rendezvous: waits for all the threads in the block to converge
/// and issues a fence, by delegating to __syncthreads().
[[clang::convergent]] LIBC_INLINE void sync_threads() {
  __syncthreads();
}

Expand Down
47 changes: 4 additions & 43 deletions libc/src/__support/RPC/rpc.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,55 +107,16 @@ template <bool InvertInbox> struct Process {
}

/// Attempt to claim the lock at index. Return true on lock taken.
/// lane_mask is a bitmap of the threads in the warp that would hold the
/// single lock on success, e.g. the result of gpu::get_lane_mask()
/// The lock is held when the zeroth bit of the uint32_t at lock[index]
/// is set, and available when that bit is clear. Bits [1, 32) are zero.
/// Or with one is a no-op when the lock is already held.
[[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask,
uint64_t index) {
// On amdgpu, test and set to lock[index] and a sync_lane would suffice
// On volta, need to handle differences between the threads running and
// the threads that were detected in the previous call to get_lane_mask()
//
// All threads in lane_mask try to claim the lock. At most one can succeed.
// There may be threads active which are not in lane mask which must not
// succeed in taking the lock, as otherwise it will leak. This is handled
// by having threads which are not in lane_mask OR with 0, which is a no-op.
uint32_t id = gpu::get_lane_id();
bool id_in_lane_mask = lane_mask & (1ul << id);

// All threads in the warp call fetch_or. Possibly at the same time.
bool before =
lock[index].fetch_or(id_in_lane_mask, cpp::MemoryOrder::RELAXED);
uint64_t packed = gpu::ballot(lane_mask, before);

// If every bit set in lane_mask is also set in packed, every single thread
// in the warp failed to get the lock. Ballot returns unset for threads not
// in the lane mask.
//
// Cases, per thread:
// mask==0 -> unspecified before, discarded by ballot -> 0
// mask==1 and before==0 (success), set zero by ballot -> 0
// mask==1 and before==1 (failure), set one by ballot -> 1
//
// mask != packed implies at least one of the threads got the lock
// atomic semantics of fetch_or mean at most one of the threads got the lock
return lane_mask != packed;
LIBC_INLINE bool try_lock(uint64_t, uint64_t index) {
  // fetch_or(1) yields the value held before the OR; zero means the lock was
  // free and this call has just claimed it. The first argument is unused.
  const auto previous = lock[index].fetch_or(1, cpp::MemoryOrder::RELAXED);
  return previous == 0;
}

// Unlock the lock at index.
[[clang::convergent]] LIBC_INLINE void unlock(uint64_t lane_mask,
uint64_t index) {
// Wait for other threads in the warp to finish using the lock
gpu::sync_lane(lane_mask);

// Use exactly one thread to clear the bit at position 0 in lock[index]
// Must restrict to a single thread to avoid one thread dropping the lock,
// then an unrelated warp claiming the lock, then a second thread in this
// warp dropping the lock again.
uint32_t and_mask = ~(rpc::is_first_lane(lane_mask) ? 1 : 0);
lock[index].fetch_and(and_mask, cpp::MemoryOrder::RELAXED);
LIBC_INLINE void unlock(uint64_t, uint64_t index) {
  // Release by writing zero into the slot with relaxed ordering. The first
  // argument is unused.
  auto &slot = lock[index];
  slot.store(0, cpp::MemoryOrder::RELAXED);
}
};

Expand Down
1 change: 0 additions & 1 deletion libc/src/__support/RPC/rpc_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#ifndef LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H
#define LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H

#include "src/__support/GPU/utils.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/properties/architectures.h"

Expand Down

0 comments on commit 8aaaa1c

Please sign in to comment.