[DAGCombiner] do not fold (fmul (fadd X, 1), Y) -> (fmad X, Y, Y) by …

…default Summary: When X = 0 and Y = inf, the original code produces inf, but the transformed code produces nan. So this transform (and its relatives) should only be used when the no-infs-fp-math flag is explicitly enabled. Also disable the transform using fmad (intermediate rounding) when unsafe-math is not enabled, since it can reduce the precision of the result; consider this example with binary floating point numbers with two bits of mantissa: x = 1.01 y = 111 x * (y + 1) = 1.01 * 1000 = 1010 (this is the exact result; no rounding occurs at any step) x * y + x = 1000.11 + 1.01 =r 1000 + 1.01 = 1001.01 =r 1000 (with rounding towards zero) The example relies on rounding towards zero at least in the second step. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=98578 Reviewers: RKSimon, tstellarAMD, spatel, arsenm Subscribers: wdng, llvm-commits Differential Revision: https://reviews.llvm.org/D26602 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@288506 91177308-0d34-0410-b5e6-96231b3b80d8
liuchyi · Dec 2, 2016 · 42285f5 · 42285f5
1 parent 7ac6b09
commit 42285f5
Show file tree

Hide file tree

Showing 4 changed files with 390 additions and 147 deletions.
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8392,17 +8392,23 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
   assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation");
 
   const TargetOptions &Options = DAG.getTarget().Options;
-  bool AllowFusion =
-      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
 
-  // Floating-point multiply-add with intermediate rounding.
-  bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
+  // The transforms below are incorrect when x == 0 and y == inf, because the
+  // intermediate multiplication produces a nan.
+  if (!Options.NoInfsFPMath)
+    return SDValue();
 
   // Floating-point multiply-add without intermediate rounding.
   bool HasFMA =
-      AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) &&
+      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) &&
+      TLI.isFMAFasterThanFMulAndFAdd(VT) &&
       (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
 
+  // Floating-point multiply-add with intermediate rounding. This can result
+  // in a less precise result due to the changed rounding order.
+  bool HasFMAD = Options.UnsafeFPMath &&
+                 (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
+
   // No valid opcode, do not combine.
   if (!HasFMAD && !HasFMA)
     return SDValue();

diff --git a/test/CodeGen/AMDGPU/fma-combine.ll b/test/CodeGen/AMDGPU/fma-combine.ll
@@ -1,5 +1,10 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-FASTFMAF -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-SLOWFMAF -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI -check-prefix=FUNC %s
+
+; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
+; beneficial even without fp32 denormals, but they do require no-infs-fp-math
+; for correctness.
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare double @llvm.fabs.f64(double) #0
@@ -369,7 +374,10 @@ define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %
 ;
 
 ; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
-; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY:v[0-9]]], [[VX:v[0-9]]]
+; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+;
+; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
 define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
@@ -382,7 +390,10 @@ define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
 }
 
 ; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
-; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY:v[0-9]]], [[VX:v[0-9]]]
+; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+;
+; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
 define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
@@ -395,7 +406,10 @@ define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
 }
 
 ; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
-; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
+; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+;
+; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
 define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
                                            float addrspace(1)* %in1,
                                            float addrspace(1)* %in2) {
@@ -408,7 +422,10 @@ define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
 }
 
 ; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
-; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
+; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+;
+; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
 define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
                                            float addrspace(1)* %in1,
                                            float addrspace(1)* %in2) {
@@ -421,7 +438,10 @@ define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
 }
 
 ; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
-; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], [[VY]]
+; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+;
+; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
 define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
@@ -434,7 +454,10 @@ define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
 }
 
 ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
-; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], [[VY]]
+; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+;
+; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
 define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
@@ -447,7 +470,10 @@ define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
 }
 
 ; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
-; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], -[[VY]]
+; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+;
+; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
 define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
                                            float addrspace(1)* %in1,
                                            float addrspace(1)* %in2) {
@@ -460,7 +486,10 @@ define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
 }
 
 ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
-; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], -[[VY]]
+; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+;
+; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
 define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) {
@@ -473,7 +502,10 @@ define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
 }
 
 ; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
-; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
+; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+;
+; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
 define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
@@ -486,7 +518,10 @@ define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
 }
 
 ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
-; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
+; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+;
+; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
 define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
                                       float addrspace(1)* %in1,
                                       float addrspace(1)* %in2) {
@@ -499,7 +534,10 @@ define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
 }
 
 ; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
-; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY]], [[VX:v[0-9]]]
+; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+;
+; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
 define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) {
@@ -512,7 +550,10 @@ define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
 }
 
 ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
-; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY]], [[VX:v[0-9]]]
+; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+;
+; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
 define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) {
@@ -529,8 +570,12 @@ define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
 ;
 
 ; FUNC-LABEL: {{^}}test_f32_interp:
-; SI: v_mad_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
-; SI: v_mac_f32_e32 [[VR]], [[VT]], [[VX:v[0-9]]]
+; SI-NOFMA: v_sub_f32_e32 [[VT1:v[0-9]]], 1.0, [[VT:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VT1]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VT]], [[VX:v[0-9]]]
+;
+; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
+; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
 define void @test_f32_interp(float addrspace(1)* %out,
                              float addrspace(1)* %in1,
                              float addrspace(1)* %in2,
@@ -547,8 +592,12 @@ define void @test_f32_interp(float addrspace(1)* %out,
 }
 
 ; FUNC-LABEL: {{^}}test_f64_interp:
-; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
-; SI: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
+; SI-NOFMA: v_add_f64 [[VT1:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], 1.0
+; SI-NOFMA: v_mul_f64 [[VTY:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VT1]]
+; SI-NOFMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VTY]]
+;
+; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
+; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
 define void @test_f64_interp(double addrspace(1)* %out,
                              double addrspace(1)* %in1,
                              double addrspace(1)* %in2,