diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 4b2b8c9fd7b5..ae56349580a7 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -7641,19 +7641,6 @@ defm VSQRT   : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>,
 
 defm VSQRT  : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG;
 
-let Predicates = [HasAVX512] in {
-  def : Pat<(f32 (X86frsqrt FR32X:$src)),
-            (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>;
-  def : Pat<(f32 (X86frsqrt (load addr:$src))),
-            (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
-            Requires<[OptForSize]>;
-  def : Pat<(f32 (X86frcp FR32X:$src)),
-            (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>;
-  def : Pat<(f32 (X86frcp (load addr:$src))),
-            (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
-            Requires<[OptForSize]>;
-}
-
 multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
                                   X86VectorVTInfo _> {
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 08d28a78bf0f..4314506c34f2 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3095,7 +3095,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                           ValueType vt, ValueType ScalarVT,
                           X86MemOperand x86memop,
                           Intrinsic Intr, SDNode OpNode, Domain d,
-                          OpndItins itins, string Suffix> {
+                          OpndItins itins, Predicate target, string Suffix> {
   let hasSideEffects = 0 in {
   def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -3125,7 +3125,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
   // vrcpss mem, %xmm0, %xmm0
   // TODO: In theory, we could fold the load, and avoid the stall caused by
   // the partial register store, either in ExecutionDepsFix or with smarter RA.
-  let Predicates = [UseAVX] in {
+  let Predicates = [target] in {
   def : Pat<(OpNode RC:$src),  (!cast<Instruction>("V"#NAME#Suffix##r)
                                (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
   }
@@ -3140,7 +3140,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
             (!cast<Instruction>("V"#NAME#Suffix##m_Int)
                   (vt (IMPLICIT_DEF)), addr:$src2)>;
   }
-  let Predicates = [UseAVX, OptForSize] in {
+  let Predicates = [target, OptForSize] in {
   def : Pat<(ScalarVT (OpNode (load addr:$src))),
             (!cast<Instruction>("V"#NAME#Suffix##m)
                 (ScalarVT (IMPLICIT_DEF)), addr:$src)>;
@@ -3220,40 +3220,40 @@ let Predicates = [HasAVX, NoVLX] in {
 }
 
 multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                          OpndItins itins> {
+                          OpndItins itins, Predicate AVXTarget> {
   defm SS        :  sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
                       ssmem, sse_load_f32,
                       !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
                       SSEPackedSingle, itins, UseSSE1, "SS">, XS;
   defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32, f32mem,
                       !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
-                      SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG, VEX_WIG,
-                      NotMemoryFoldable;
+                      SSEPackedSingle, itins, AVXTarget, "SS">, XS, VEX_4V,
+                      VEX_LIG, VEX_WIG, NotMemoryFoldable;
 }
 
 multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                          OpndItins itins> {
+                          OpndItins itins, Predicate AVXTarget> {
   defm SD         : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
                        sdmem, sse_load_f64,
                        !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
                        OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
   defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64, f64mem,
                        !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
-                       OpNode, SSEPackedDouble, itins, "SD">,
+                       OpNode, SSEPackedDouble, itins, AVXTarget, "SD">,
                        XD, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
 }
 
 // Square root.
-defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>,
+defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS, UseAVX>,
             sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX, NoVLX]>,
-            sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>,
+            sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD, UseAVX>,
             sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
 
 // Reciprocal approximations. Note that these typically require refinement
 // in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
+defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS, HasAVX>,
             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX]>;
-defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>,
+defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS, HasAVX>,
             sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX]>;
 
 // There is no f64 version of the reciprocal approximation instructions.
diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll
index 56856d2204f3..296d165b3eb5 100644
--- a/test/CodeGen/X86/recip-fastmath.ll
+++ b/test/CodeGen/X86/recip-fastmath.ll
@@ -144,14 +144,14 @@ define float @f32_one_step(float %x) #1 {
 ;
 ; KNL-LABEL: f32_one_step:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; KNL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: f32_one_step:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
 ; SKX-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -257,7 +257,7 @@ define float @f32_two_step(float %x) #2 {
 ;
 ; KNL-LABEL: f32_two_step:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; KNL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
 ; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
 ; KNL-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
@@ -268,7 +268,7 @@ define float @f32_two_step(float %x) #2 {
 ;
 ; SKX-LABEL: f32_two_step:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
 ; SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
 ; SKX-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
diff --git a/test/CodeGen/X86/recip-fastmath2.ll b/test/CodeGen/X86/recip-fastmath2.ll
index 338f23f0bc31..f6eeeec57f11 100644
--- a/test/CodeGen/X86/recip-fastmath2.ll
+++ b/test/CodeGen/X86/recip-fastmath2.ll
@@ -56,13 +56,13 @@ define float @f32_no_step_2(float %x) #3 {
 ;
 ; KNL-LABEL: f32_no_step_2:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm0
+; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
 ; KNL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: f32_no_step_2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
 ; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast float 1234.0, %x
@@ -144,7 +144,7 @@ define float @f32_one_step_2(float %x) #1 {
 ;
 ; KNL-LABEL: f32_one_step_2:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; KNL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
@@ -152,7 +152,7 @@ define float @f32_one_step_2(float %x) #1 {
 ;
 ; SKX-LABEL: f32_one_step_2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
 ; SKX-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
@@ -243,7 +243,7 @@ define float @f32_one_step_2_divs(float %x) #1 {
 ;
 ; KNL-LABEL: f32_one_step_2_divs:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; KNL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
@@ -252,7 +252,7 @@ define float @f32_one_step_2_divs(float %x) #1 {
 ;
 ; SKX-LABEL: f32_one_step_2_divs:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
 ; SKX-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
@@ -368,7 +368,7 @@ define float @f32_two_step_2(float %x) #2 {
 ;
 ; KNL-LABEL: f32_two_step_2:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; KNL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
 ; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
 ; KNL-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
@@ -380,7 +380,7 @@ define float @f32_two_step_2(float %x) #2 {
 ;
 ; SKX-LABEL: f32_two_step_2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
 ; SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
 ; SKX-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.33]