Skip to content

Commit

Permalink
AMDGPU: Add cvt.pkrtz intrinsic
Browse files Browse the repository at this point in the history
Convert llvm.SI.packf16 test uses

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295797 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
arsenm committed Feb 22, 2017
1 parent 2b21feb commit 7d65faa
Show file tree
Hide file tree
Showing 18 changed files with 351 additions and 51 deletions.
4 changes: 4 additions & 0 deletions include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,10 @@ def int_amdgcn_fract : Intrinsic<
[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]
>;

def int_amdgcn_cvt_pkrtz : Intrinsic<
[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]
>;

def int_amdgcn_class : Intrinsic<
[llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]
>;
Expand Down
1 change: 1 addition & 0 deletions lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3458,6 +3458,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_F32_UBYTE1)
NODE_NAME_CASE(CVT_F32_UBYTE2)
NODE_NAME_CASE(CVT_F32_UBYTE3)
NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
Expand Down
5 changes: 5 additions & 0 deletions lib/Target/AMDGPU/AMDGPUISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,11 @@ enum NodeType : unsigned {
CVT_F32_UBYTE1,
CVT_F32_UBYTE2,
CVT_F32_UBYTE3,

// Convert two float 32 numbers into a single register holding two packed f16
// with round to zero.
CVT_PKRTZ_F16_F32,

/// This node is for VLIW targets and it is used to represent a vector
/// that is stored in consecutive registers with the same channel.
/// For example:
Expand Down
6 changes: 6 additions & 0 deletions lib/Target/AMDGPU/AMDGPUInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ def AMDGPUFPClassOp : SDTypeProfile<1, 2,
[SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>]
>;

def AMDGPUFPPackOp : SDTypeProfile<1, 2,
[SDTCisFP<1>, SDTCisSameAs<1, 2>]
>;

def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
[SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
>;
Expand Down Expand Up @@ -78,6 +82,8 @@ def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;

def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;

def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;

def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;

// out = max(a, b) a and b are floats, where a nan comparison fails.
Expand Down
45 changes: 41 additions & 4 deletions lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);

setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
Expand Down Expand Up @@ -2014,6 +2015,23 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(Res);
return;
}
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
switch (IID) {
case Intrinsic::amdgcn_cvt_pkrtz: {
SDValue Src0 = N->getOperand(1);
SDValue Src1 = N->getOperand(2);
SDLoc SL(N);
SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
Src0, Src1);

Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
return;
}
default:
break;
}
}
default:
break;
}
Expand Down Expand Up @@ -2691,10 +2709,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
case AMDGPUIntrinsic::SI_packf16:
if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
return DAG.getUNDEF(MVT::i32);
return Op;
case Intrinsic::amdgcn_interp_mov: {
SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
SDValue Glue = M0.getValue(1);
Expand Down Expand Up @@ -2811,6 +2825,18 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::amdgcn_sffbh:
return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_cvt_pkrtz: {
// FIXME: Stop adding cast if v2f16 legal.
EVT VT = Op.getValueType();
SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
Op.getOperand(1), Op.getOperand(2));
return DAG.getNode(ISD::BITCAST, DL, VT, Node);
}
case AMDGPUIntrinsic::SI_packf16: { // Legacy name
EVT VT = Op.getValueType();
return DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, VT,
Op.getOperand(1), Op.getOperand(2));
}
default:
return AMDGPUTargetLowering::LowerOperation(Op, DAG);
}
Expand Down Expand Up @@ -4154,6 +4180,15 @@ SDValue SITargetLowering::performFMed3Combine(SDNode *N,
return SDValue();
}

SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SDValue Src0 = N->getOperand(0);
SDValue Src1 = N->getOperand(1);
if (Src0.isUndef() && Src1.isUndef())
return DCI.DAG.getUNDEF(N->getValueType(0));
return SDValue();
}

unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0,
const SDNode *N1) const {
Expand Down Expand Up @@ -4422,6 +4457,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performCvtF32UByteNCombine(N, DCI);
case AMDGPUISD::FMED3:
return performFMed3Combine(N, DCI);
case AMDGPUISD::CVT_PKRTZ_F16_F32:
return performCvtPkRTZCombine(N, DCI);
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
Expand Down
1 change: 1 addition & 0 deletions lib/Target/AMDGPU/SIISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue Op0, SDValue Op1) const;
SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;

unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0, const SDNode *N1) const;
Expand Down
1 change: 1 addition & 0 deletions lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1151,6 +1151,7 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;

def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
Expand Down
2 changes: 1 addition & 1 deletion lib/Target/AMDGPU/VOP2Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>;
defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst"
defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_I32_F32_F32>;
defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_I32_F32_F32>;
defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, int_SI_packf16>;
defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, AMDGPUpkrtz_f16_f32>;
defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_I32_I32_I32>;
defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_I32_I32_I32>;

Expand Down
25 changes: 25 additions & 0 deletions lib/Transforms/InstCombine/InstCombineCalls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3206,6 +3206,31 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {

return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result));
}
case Intrinsic::amdgcn_cvt_pkrtz: {
Value *Src0 = II->getArgOperand(0);
Value *Src1 = II->getArgOperand(1);
if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
const fltSemantics &HalfSem
= II->getType()->getScalarType()->getFltSemantics();
bool LosesInfo;
APFloat Val0 = C0->getValueAPF();
APFloat Val1 = C1->getValueAPF();
Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

Constant *Folded = ConstantVector::get({
ConstantFP::get(II->getContext(), Val0),
ConstantFP::get(II->getContext(), Val1) });
return replaceInstUsesWith(*II, Folded);
}
}

if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
return replaceInstUsesWith(*II, UndefValue::get(II->getType()));

break;
}
case Intrinsic::stackrestore: {
// If the save is right next to the restore, remove the restore. This can
// happen when variable allocas are DCE'd.
Expand Down
6 changes: 3 additions & 3 deletions test/CodeGen/AMDGPU/commute-shifts.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@ bb:
%tmp5 = and i32 %tmp2, %tmp4
%tmp6 = icmp eq i32 %tmp5, 0
%tmp7 = select i1 %tmp6, float 0.000000e+00, float %arg1
%tmp8 = call i32 @llvm.SI.packf16(float undef, float %tmp7)
%tmp9 = bitcast i32 %tmp8 to float
%tmp8 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp7)
%tmp9 = bitcast <2 x half> %tmp8 to float
ret float %tmp9
}

declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
declare i32 @llvm.SI.packf16(float, float) #1
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
163 changes: 163 additions & 0 deletions test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; GCN-LABEL: {{^}}s_cvt_pkrtz_v2f16_f32:
; GCN: s_load_dword [[X:s[0-9]+]]
; GCN: s_load_dword [[Y:s[0-9]+]]
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]]
; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
; VI: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[VY]]
define void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
store <2 x half> %result, <2 x half> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}s_cvt_pkrtz_samereg_v2f16_f32:
; GCN: s_load_dword [[X:s[0-9]+]]
; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[X]]
define void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 {
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
store <2 x half> %result, <2 x half> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}s_cvt_pkrtz_undef_undef:
; GCN-NEXT: ; BB#0
; GCN-NEXT: s_endpgm
define void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 {
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef)
store <2 x half> %result, <2 x half> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
; VI: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], [[B]]
define void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%b = load volatile float, float addrspace(1)* %b.gep
%cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
ret void
}

; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_reg_imm:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], 1.0
define void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 1.0)
store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
ret void
}

; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_imm_reg:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
; VI: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, 1.0, [[A]]
define void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 1.0, float %a)
store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
ret void
}

; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], [[B]]
define void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%b = load volatile float, float addrspace(1)* %b.gep
%neg.a = fsub float -0.0, %a
%cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %b)
store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
ret void
}

; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_hi:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], -[[B]]
define void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%b = load volatile float, float addrspace(1)* %b.gep
%neg.b = fsub float -0.0, %b
%cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %neg.b)
store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
ret void
}

; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], -[[B]]
define void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%b = load volatile float, float addrspace(1)* %b.gep
%neg.a = fsub float -0.0, %a
%neg.b = fsub float -0.0, %b
%cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %neg.b)
store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
ret void
}

; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -|[[A]]|, -[[B]]
define void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%b = load volatile float, float addrspace(1)* %b.gep
%fabs.a = call float @llvm.fabs.f32(float %a)
%neg.fabs.a = fsub float -0.0, %fabs.a
%neg.b = fsub float -0.0, %b
%cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.fabs.a, float %neg.b)
store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
ret void
}

declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
declare float @llvm.fabs.f32(float) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1


attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
10 changes: 4 additions & 6 deletions test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -204,11 +204,9 @@ main_body:
%tmp = call float @llvm.fabs.f32(float %p2.i)
%tmp34 = call float @llvm.fabs.f32(float %p2.i12)
%tmp35 = call float @llvm.fabs.f32(float %p2.i6)
%tmp36 = call i32 @llvm.SI.packf16(float %tmp, float %tmp34)
%tmp37 = bitcast i32 %tmp36 to <2 x half>
%tmp38 = call i32 @llvm.SI.packf16(float %tmp35, float 1.000000e+00)
%tmp39 = bitcast i32 %tmp38 to <2 x half>
call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp37, <2 x half> %tmp39, i1 true, i1 true) #0
%tmp36 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp, float %tmp34)
%tmp38 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp35, float 1.000000e+00)
call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp36, <2 x half> %tmp38, i1 true, i1 true) #0
ret void
}

Expand All @@ -218,7 +216,7 @@ declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
declare i32 @llvm.SI.packf16(float, float) #1
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
Loading

0 comments on commit 7d65faa

Please sign in to comment.