Skip to content

Commit

Permalink
[SYCL][HIP] Add AMDGPU reflect pass to choose between safe and unsafe…
Browse files Browse the repository at this point in the history
… AMDGPU atomics (intel#11467)

AMDGPU reflect pass is needed to choose between safe and unsafe atomics
at the libclc level. In the long run we will delete this patch as work
is being done to ensure correct lowering of atomic instructions. See
patches:

llvm/llvm-project#85052
llvm/llvm-project#69229

This work is necessary as malloc shared atomics rely on PCIe atomics
which can have patchy and unreliable support. Therefore, we want to be
able to choose at compile time whether we should use safe atomics using
CAS (which PCIe should support), or if we want to rely of the
availability of the newest PCIe atomics, if malloc shared atomics are
desired.

Also changes the implementation of `atomic_or`, `atomic_and` so that
they
can choose between the safe or unsafe version based on the AMDGPU
reflect value.
  • Loading branch information
hdelan authored Apr 24, 2024
1 parent daeb58b commit 34135a3
Show file tree
Hide file tree
Showing 13 changed files with 196 additions and 36 deletions.
6 changes: 6 additions & 0 deletions libclc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,10 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
# Disables NVVM reflection to defer to after linking
list( APPEND flags -Xclang -target-feature -Xclang +ptx72
-march=sm_86 -mllvm --nvvm-reflect-enable=false)
elseif( ARCH STREQUAL amdgcn )
# AMDGCN needs libclc to be compiled to high bc version since all atomic
# clang builtins need to be accessible
list( APPEND flags -mcpu=gfx940 -mllvm --amdgpu-oclc-reflect-enable=false )
elseif( ARCH STREQUAL x86_64)
# TODO: This is used by SYCL Native Cpu, we should define an option to set this flags
list( APPEND flags -Xclang -target-feature -Xclang +avx
Expand Down Expand Up @@ -358,6 +362,8 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
# simultaneously, we choose declare the builtins using the private space,
# which will also work for the generic address space.
set( supports_generic_addrspace FALSE )
elseif( ARCH STREQUAL amdgcn )
set( opt_flags -O3 --amdgpu-oclc-reflect-enable=false )
elseif( ARCH STREQUAL x86_64)
set( opt_flags )
set( supports_generic_addrspace FALSE )
Expand Down
13 changes: 4 additions & 9 deletions libclc/amdgcn-amdhsa/libspirv/atomic/atomic_and.cl
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,8 @@
#include <spirv/spirv.h>
#include <spirv/spirv_types.h>

AMDGPU_ATOMIC(_Z17__spirv_AtomicAnd, int, i, __hip_atomic_fetch_and)
AMDGPU_ATOMIC(_Z17__spirv_AtomicAnd, unsigned int, j, __hip_atomic_fetch_and)
AMDGPU_ATOMIC(_Z17__spirv_AtomicAnd, long, l, __hip_atomic_fetch_and)
AMDGPU_ATOMIC(_Z17__spirv_AtomicAnd, unsigned long, m, __hip_atomic_fetch_and)
#define __CLC_OP &
#define __SPIRV_BUILTIN _Z17__spirv_AtomicAnd
#define __HIP_BUILTIN __hip_atomic_fetch_and

#undef AMDGPU_ATOMIC
#undef AMDGPU_ATOMIC_IMPL
#undef AMDGPU_ARCH_GEQ
#undef AMDGPU_ARCH_BETWEEN
#undef GET_ATOMIC_SCOPE_AND_ORDER
#include "atomic_safe.def"
28 changes: 22 additions & 6 deletions libclc/amdgcn-amdhsa/libspirv/atomic/atomic_helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include <spirv/spirv.h>
#include <spirv/spirv_types.h>

extern int __oclc_amdgpu_reflect(__constant char *);

#define AMDGPU_ARCH_GEQ(LOWER) __oclc_ISA_version >= LOWER
#define AMDGPU_ARCH_BETWEEN(LOWER, UPPER) \
__oclc_ISA_version >= LOWER &&__oclc_ISA_version < UPPER
Expand Down Expand Up @@ -72,14 +74,22 @@
AMDGPU_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, local, U3AS3, 1, BUILTIN) \
AMDGPU_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, , , 0, BUILTIN)

#define AMDGPU_CAS_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, AS, AS_MANGLED, \
SUB1, OP) \
// Safe atomics will either choose a slow CAS atomic impl (default) or a fast
// native atomic if --amdgpu-unsafe-int-atomics is passed to LLVM.
//
// Safe atomics using CAS may be necessary if PCIe does not support atomic
// operations such as and, or, xor
#define AMDGPU_SAFE_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, AS, AS_MANGLED, \
SUB1, OP, USE_BUILTIN_COND, BUILTIN) \
_CLC_DEF TYPE \
FUNC_NAME##P##AS_MANGLED##TYPE_MANGLED##N5__spv5Scope4FlagENS##SUB1##_19MemorySemanticsMask4FlagE##TYPE_MANGLED( \
volatile AS TYPE *p, enum Scope scope, \
enum MemorySemanticsMask semantics, TYPE val) { \
int atomic_scope = 0, memory_order = 0; \
GET_ATOMIC_SCOPE_AND_ORDER(scope, atomic_scope, semantics, memory_order) \
if (USE_BUILTIN_COND) \
return BUILTIN(p, val, memory_order, atomic_scope); \
/* CAS atomics*/ \
TYPE oldval = __hip_atomic_load(p, memory_order, atomic_scope); \
TYPE newval = 0; \
do { \
Expand All @@ -89,7 +99,13 @@
return oldval; \
}

#define AMDGPU_CAS_ATOMIC(FUNC_NAME, TYPE, TYPE_MANGLED, OP) \
AMDGPU_CAS_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, global, U3AS1, 1, OP) \
AMDGPU_CAS_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, local, U3AS3, 1, OP) \
AMDGPU_CAS_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, , , 0, OP)
#define AMDGPU_SAFE_ATOMIC(FUNC_NAME, TYPE, TYPE_MANGLED, OP, BUILTIN) \
AMDGPU_SAFE_ATOMIC_IMPL( \
FUNC_NAME, TYPE, TYPE_MANGLED, global, U3AS1, 1, OP, \
__oclc_amdgpu_reflect("AMDGPU_OCLC_UNSAFE_INT_ATOMICS"), BUILTIN) \
AMDGPU_SAFE_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, local, U3AS3, 1, OP, \
true /* local AS should always use builtin*/, \
BUILTIN) \
AMDGPU_SAFE_ATOMIC_IMPL( \
FUNC_NAME, TYPE, TYPE_MANGLED, , , 0, OP, \
__oclc_amdgpu_reflect("AMDGPU_OCLC_UNSAFE_INT_ATOMICS"), BUILTIN)
13 changes: 4 additions & 9 deletions libclc/amdgcn-amdhsa/libspirv/atomic/atomic_or.cl
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,8 @@
#include <spirv/spirv.h>
#include <spirv/spirv_types.h>

AMDGPU_ATOMIC(_Z16__spirv_AtomicOr, int, i, __hip_atomic_fetch_or)
AMDGPU_ATOMIC(_Z16__spirv_AtomicOr, unsigned int, j, __hip_atomic_fetch_or)
AMDGPU_ATOMIC(_Z16__spirv_AtomicOr, long, l, __hip_atomic_fetch_or)
AMDGPU_ATOMIC(_Z16__spirv_AtomicOr, unsigned long, m, __hip_atomic_fetch_or)
#define __CLC_OP |
#define __SPIRV_BUILTIN _Z16__spirv_AtomicOr
#define __HIP_BUILTIN __hip_atomic_fetch_or

#undef AMDGPU_ATOMIC
#undef AMDGPU_ATOMIC_IMPL
#undef AMDGPU_ARCH_GEQ
#undef AMDGPU_ARCH_BETWEEN
#undef GET_ATOMIC_SCOPE_AND_ORDER
#include "atomic_safe.def"
9 changes: 9 additions & 0 deletions libclc/amdgcn-amdhsa/libspirv/atomic/atomic_safe.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Before including, define: __SPIRV_BUILTIN, __CLC_OP, __HIP_BUILTIN
// and include atomic_helpers.h to get AMDGPU_SAFE_ATOMIC

AMDGPU_SAFE_ATOMIC(__SPIRV_BUILTIN, int, i, __CLC_OP, __HIP_BUILTIN)
AMDGPU_SAFE_ATOMIC(__SPIRV_BUILTIN, unsigned int, j, __CLC_OP,
__HIP_BUILTIN)
AMDGPU_SAFE_ATOMIC(__SPIRV_BUILTIN, long, l, __CLC_OP, __HIP_BUILTIN)
AMDGPU_SAFE_ATOMIC(__SPIRV_BUILTIN, unsigned long, m, __CLC_OP,
__HIP_BUILTIN)
16 changes: 4 additions & 12 deletions libclc/amdgcn-amdhsa/libspirv/atomic/atomic_xor.cl
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,8 @@
#include <spirv/spirv.h>
#include <spirv/spirv_types.h>

#define __CLC_XOR ^
#define __CLC_OP ^
#define __SPIRV_BUILTIN _Z17__spirv_AtomicXor
#define __HIP_BUILTIN __hip_atomic_fetch_xor

AMDGPU_CAS_ATOMIC(_Z17__spirv_AtomicXor, int, i, __CLC_XOR)
AMDGPU_CAS_ATOMIC(_Z17__spirv_AtomicXor, unsigned int, j, __CLC_XOR)
AMDGPU_CAS_ATOMIC(_Z17__spirv_AtomicXor, long, l, __CLC_XOR)
AMDGPU_CAS_ATOMIC(_Z17__spirv_AtomicXor, unsigned long, m, __CLC_XOR)

#undef __CLC_XOR
#undef AMDGPU_ATOMIC
#undef AMDGPU_ATOMIC_IMPL
#undef AMDGPU_ARCH_GEQ
#undef AMDGPU_ARCH_BETWEEN
#undef GET_ATOMIC_SCOPE_AND_ORDER
#include "atomic_safe.def"
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,12 @@ struct AMDGPULowerKernelAttributesPass
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};

struct AMDGPUOclcReflectPass : public PassInfoMixin<AMDGPUOclcReflectPass> {
public:
PreservedAnalyses run(Function &M, FunctionAnalysisManager &AM);
static bool isRequired() { return true; }
};

void initializeAMDGPULowerModuleLDSLegacyPass(PassRegistry &);
extern char &AMDGPULowerModuleLDSLegacyPassID;

Expand Down
92 changes: 92 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUOclcReflect.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
//===- AMDGPUOclcReflect.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass searches for occurences of the AMDGPU_OCLC_REFLECT function, and
// replaces the calls with some val dependent on the operand of the func. This
// can be used to reflect across different implementations of functions at
// compile time based on a compiler flag or some other means. This pass
// currently supports use cases:
//
// 1. Choose a safe or unsafe version of atomic_xor at compile time, which can
// be chosen at compile time by setting the flag
// --amdgpu-oclc-unsafe-int-atomics=true.
//
// This pass is similar to the NVPTX pass NVVMReflect.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define AMDGPU_OCLC_REFLECT "__oclc_amdgpu_reflect"

static cl::opt<bool>
AMDGPUReflectEnabled("amdgpu-oclc-reflect-enable", cl::init(true),
cl::Hidden,
cl::desc("AMDGPU reflection, enabled by default"));
static cl::opt<bool> AMDGPUUnsafeIntAtomicsEnable(
"amdgpu-oclc-unsafe-int-atomics", cl::init(false), cl::Hidden,
cl::desc("Should unsafe int atomics be chosen. Disabled by default."));

PreservedAnalyses AMDGPUOclcReflectPass::run(Function &F,
FunctionAnalysisManager &AM) {
if (!AMDGPUReflectEnabled)
return PreservedAnalyses::all();

if (F.getName() == AMDGPU_OCLC_REFLECT) {
assert(F.isDeclaration() &&
"__oclc_amdgpu_reflect function should not have a body");
return PreservedAnalyses::all();
}

SmallVector<CallInst *, 4> ToRemove;

for (Instruction &I : instructions(F)) {
auto *Call = dyn_cast<CallInst>(&I);
if (!Call)
continue;
if (Function *Callee = Call->getCalledFunction();
!Callee || Callee->getName() != AMDGPU_OCLC_REFLECT)
continue;

assert(Call->arg_size() == 1 &&
"Wrong number of operands to __oclc_amdgpu_reflect function");

ToRemove.push_back(Call);
}

if (!ToRemove.size())
return PreservedAnalyses::all();

for (CallInst *Call : ToRemove) {
const Value *Str = Call->getArgOperand(0);
const Value *Operand = cast<Constant>(Str)->getOperand(0);
StringRef ReflectArg = cast<ConstantDataSequential>(Operand)->getAsString();
ReflectArg = ReflectArg.drop_back(1);

if (ReflectArg == "AMDGPU_OCLC_UNSAFE_INT_ATOMICS") {
int ReflectVal = AMDGPUUnsafeIntAtomicsEnable ? 1 : 0;
Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal));
} else {
report_fatal_error("Invalid arg passed to __oclc_amdgpu_reflect");
}
Call->eraseFromParent();
}

PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
PA.preserve<DominatorTreeAnalysis>();
return PA;
}
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ FUNCTION_PASS("amdgpu-lower-kernel-arguments",
AMDGPULowerKernelArgumentsPass(*this))
FUNCTION_PASS("amdgpu-lower-kernel-attributes",
AMDGPULowerKernelAttributesPass())
FUNCTION_PASS("amdgpu-oclc-reflect", AMDGPUOclcReflectPass())
FUNCTION_PASS("amdgpu-simplifylib", AMDGPUSimplifyLibCallsPass())
FUNCTION_PASS("amdgpu-promote-alloca", AMDGPUPromoteAllocaPass(*this))
FUNCTION_PASS("amdgpu-promote-alloca-to-vector",
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
PB.registerPipelineStartEPCallback(
[](ModulePassManager &PM, OptimizationLevel Level) {
FunctionPassManager FPM;
FPM.addPass(AMDGPUOclcReflectPass());
PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
if (EnableHipStdPar)
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUInsertSingleUseVDST.cpp
AMDGPUMarkLastScratchLoad.cpp
AMDGPUMIRFormatter.cpp
AMDGPUOclcReflect.cpp
AMDGPUOpenCLEnqueuedBlockLowering.cpp
AMDGPUPerfHintAnalysis.cpp
AMDGPUPostLegalizerCombiner.cpp
Expand Down
22 changes: 22 additions & 0 deletions llvm/test/CodeGen/AMDGPU/amdgpu-oclc-reflect.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt -S -p amdgpu-oclc-reflect %s | FileCheck %s -check-prefixes=CHECK,CHECK-SAFE-ATOMICS
; RUN: opt -S -p amdgpu-oclc-reflect -amdgpu-oclc-unsafe-int-atomics=true %s | FileCheck %s -check-prefixes=CHECK,CHECK-UNSAFE-ATOMICS

target triple = "amdgcn-amd-amdhsa"

@.str = private unnamed_addr addrspace(4) constant [31 x i8] c"AMDGPU_OCLC_UNSAFE_INT_ATOMICS\00", align 1

declare hidden i32 @__oclc_amdgpu_reflect(ptr addrspace(4) noundef) local_unnamed_addr

define i32 @foo() {
; CHECK-SAFE-ATOMICS-LABEL: define i32 @foo() {
; CHECK-SAFE-ATOMICS-NEXT: ret i32 0
;
; CHECK-UNSAFE-ATOMICS-LABEL: define i32 @foo() {
; CHECK-UNSAFE-ATOMICS-NEXT: ret i32 1
;
%call = tail call i32 @__oclc_amdgpu_reflect(ptr addrspace(4) noundef @.str)
ret i32 %call
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
24 changes: 24 additions & 0 deletions sycl/test/check_device_code/hip/atomic/amdgpu_unsafe_atomics.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// REQUIRES: hip
// RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx906 %s -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SAFE
// RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx906 %s -mllvm --amdgpu-oclc-unsafe-int-atomics=true -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-UNSAFE

#include <sycl/sycl.hpp>

int main() {
sycl::queue{}.single_task([=] {
int a;
sycl::atomic_ref<int, sycl::memory_order_relaxed, sycl::memory_scope_device>
atomicInt(a);
atomicInt.fetch_xor(1);
atomicInt.fetch_and(1);
atomicInt.fetch_or(1);
// CHECK: __CLANG_OFFLOAD_BUNDLE____START__ sycl-amdgcn-amd-amdhsa-
// CHECK-SAFE: cmpxchg volatile
// CHECK-SAFE-NOT: atomicrmw
// CHECK-UNSAFE: atomicrmw volatile xor
// CHECK-UNSAFE: atomicrmw volatile and
// CHECK-UNSAFE: atomicrmw volatile or
// CHECK-UNSAFE-NOT: cmpxchg
// CHECK: __CLANG_OFFLOAD_BUNDLE____END__ sycl-amdgcn-amd-amdhsa-
});
}

0 comments on commit 34135a3

Please sign in to comment.