diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 7021941076f6..74353c1788a5 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -1069,6 +1069,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; case ARMISD::FMAX: return "ARMISD::FMAX"; case ARMISD::FMIN: return "ARMISD::FMIN"; + case ARMISD::VMAXNM: return "ARMISD::VMAX"; + case ARMISD::VMINNM: return "ARMISD::VMIN"; case ARMISD::BFI: return "ARMISD::BFI"; case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; @@ -3276,6 +3278,20 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // Try to generate VSEL on ARMv8. if (getSubtarget()->hasV8FP() && (TrueVal.getValueType() == MVT::f32 || TrueVal.getValueType() == MVT::f64)) { + // We can select VMAXNM/VMINNM from a compare followed by a select with the + // same operands, as follows: + // c = fcmp [ogt, olt, ugt, ult] a, b + // select c, a, b + // We only do this in unsafe-fp-math, because signed zeros and NaNs are + // handled differently than the original code sequence. + if (getTargetMachine().Options.UnsafeFPMath && LHS == TrueVal && + RHS == FalseVal) { + if (CC == ISD::SETOGT || CC == ISD::SETUGT) + return DAG.getNode(ARMISD::VMAXNM, dl, VT, TrueVal, FalseVal); + if (CC == ISD::SETOLT || CC == ISD::SETULT) + return DAG.getNode(ARMISD::VMINNM, dl, VT, TrueVal, FalseVal); + } + bool swpCmpOps = false; bool swpVselOps = false; checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 44c769f82213..be7811f8feb6 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -186,6 +186,8 @@ namespace llvm { // Floating-point max and min: FMAX, FMIN, + VMAXNM, + VMINNM, // Bit-field insert BFI, diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index df64a09514a7..dc9a6d2b33c5 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -71,6 +71,9 @@ def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; +def SDT_ARMVMAXNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>; +def SDT_ARMVMINNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>; + def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, @@ -174,9 +177,11 @@ def ARMrbit : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>; def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; - def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>; +def ARMvmaxnm : SDNode<"ARMISD::VMAXNM", SDT_ARMVMAXNM, []>; +def ARMvminnm : SDNode<"ARMISD::VMINNM", SDT_ARMVMINNM, []>; + //===----------------------------------------------------------------------===// // ARM Instruction Predicate Definitions. // diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index b4df4d787e3a..3bb4d6f97f96 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -356,22 +356,24 @@ defm VSELGE : vsel_inst<"ge", 0b10, 10>; defm VSELEQ : vsel_inst<"eq", 0b00, 0>; defm VSELVS : vsel_inst<"vs", 0b01, 6>; -multiclass vmaxmin_inst { +multiclass vmaxmin_inst { let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in { def S : ASbInp<0b11101, 0b00, opc, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), NoItinerary, !strconcat(op, ".f32\t$Sd, $Sn, $Sm"), - []>, Requires<[HasV8FP]>; + [(set SPR:$Sd, (SD SPR:$Sn, SPR:$Sm))]>, + Requires<[HasV8FP]>; def D : ADbInp<0b11101, 0b00, opc, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), NoItinerary, !strconcat(op, ".f64\t$Dd, $Dn, $Dm"), - []>, Requires<[HasV8FP]>; + [(set DPR:$Dd, (f64 (SD (f64 DPR:$Dn), (f64 DPR:$Dm))))]>, + Requires<[HasV8FP]>; } } -defm VMAXNM : vmaxmin_inst<"vmaxnm", 0>; -defm VMINNM : vmaxmin_inst<"vminnm", 1>; +defm VMAXNM : vmaxmin_inst<"vmaxnm", 0, ARMvmaxnm>; +defm VMINNM : vmaxmin_inst<"vminnm", 1, ARMvminnm>; // Match reassociated forms only if not sign dependent rounding. def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)), diff --git a/test/CodeGen/ARM/vminmaxnm.ll b/test/CodeGen/ARM/vminmaxnm.ll index afa73b9e9c0b..fdf0c6a7627a 100644 --- a/test/CodeGen/ARM/vminmaxnm.ll +++ b/test/CodeGen/ARM/vminmaxnm.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple armv8 -mattr=+neon | FileCheck %s +; RUN: llc < %s -mtriple armv8 -mattr=+neon,+v8fp -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK-FAST define <4 x float> @vmaxnmq(<4 x float>* %A, <4 x float>* %B) nounwind { ; CHECK: vmaxnmq @@ -36,6 +37,51 @@ define <2 x float> @vminnmd(<2 x float>* %A, <2 x float>* %B) nounwind { ret <2 x float> %tmp3 } +define float @v8fp_vminnm_o(float %a, float %b) { +; CHECK-FAST: v8fp_vminnm_o +; CHECK-FAST-NOT: vcmp +; CHECK-FAST: vminnm.f32 +; CHECK: v8fp_vminnm_o +; CHECK-NOT: vminnm.f32 + %cmp = fcmp olt float %a, %b + %cond = select i1 %cmp, float %a, float %b + ret float %cond +} + +define float @v8fp_vminnm_u(float %a, float %b) { +; CHECK-FAST: v8fp_vminnm_u +; CHECK-FAST-NOT: vcmp +; CHECK-FAST: vminnm.f32 +; CHECK: v8fp_vminnm_u +; CHECK-NOT: vminnm.f32 + %cmp = fcmp ult float %a, %b + %cond = select i1 %cmp, float %a, float %b + ret float %cond +} + +define float @v8fp_vmaxnm_o(float %a, float %b) { +; CHECK-FAST: v8fp_vmaxnm_o +; CHECK-FAST-NOT: vcmp +; CHECK-FAST: vmaxnm.f32 +; CHECK: v8fp_vmaxnm_o +; CHECK-NOT: vmaxnm.f32 + %cmp = fcmp ogt float %a, %b + %cond = select i1 %cmp, float %a, float %b + ret float %cond +} + +define float @v8fp_vmaxnm_u(float %a, float %b) { +; CHECK-FAST: v8fp_vmaxnm_u +; CHECK-FAST-NOT: vcmp +; CHECK-FAST: vmaxnm.f32 +; CHECK: v8fp_vmaxnm_u +; CHECK-NOT: vmaxnm.f32 + %cmp = fcmp ugt float %a, %b + %cond = select i1 %cmp, float %a, float %b + ret float %cond +} + + declare <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float>, <4 x float>) nounwind readnone declare <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float>, <2 x float>) nounwind readnone declare <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float>, <4 x float>) nounwind readnone