From c2faa70777902555e3adfde51eddd294887d9a2c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 15 May 2018 14:12:32 +0000 Subject: [PATCH] [X86] Split off F16C WriteCvtPH2PS/WriteCvtPS2PH scheduler classes Btver2 - VCVTPH2PSYrm needs to double pump the AGU Broadwell - missing VCVTPS2PH*mr stores extra latency Allows us to remove the WriteCvtF2FSt conversion store class git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@332357 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrAVX512.td | 30 ++++++++------ lib/Target/X86/X86InstrSSE.td | 24 ++++++----- lib/Target/X86/X86SchedBroadwell.td | 24 +++++------ lib/Target/X86/X86SchedHaswell.td | 33 ++++++--------- lib/Target/X86/X86SchedSandyBridge.td | 9 +++- lib/Target/X86/X86SchedSkylakeClient.td | 29 +++++-------- lib/Target/X86/X86SchedSkylakeServer.td | 31 +++++--------- lib/Target/X86/X86Schedule.td | 9 +++- lib/Target/X86/X86ScheduleAtom.td | 8 +++- lib/Target/X86/X86ScheduleBtVer2.td | 41 ++++--------------- lib/Target/X86/X86ScheduleSLM.td | 8 +++- lib/Target/X86/X86ScheduleZnver1.td | 14 ++++--- test/CodeGen/X86/f16c-schedule.ll | 4 +- .../llvm-mca/X86/Broadwell/resources-f16c.s | 4 +- .../llvm-mca/X86/BtVer2/resources-f16c.s | 4 +- 15 files changed, 127 insertions(+), 145 deletions(-) diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index d8991fa0e9fb..3ac2b43a67b5 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -7863,16 +7863,16 @@ multiclass avx512_cvtph2ps_sae, + WriteCvtPH2PSY>, avx512_cvtph2ps_sae, EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; let Predicates = [HasVLX] in { defm VCVTPH2PSZ256 : avx512_cvtph2ps, EVEX, EVEX_V256, + loadv2i64, WriteCvtPH2PSY>, EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; defm VCVTPH2PSZ128 : avx512_cvtph2ps, EVEX, EVEX_V128, + loadv2i64, WriteCvtPH2PS>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; // Pattern match vcvtph2ps of a scalar i64 load. @@ -7886,42 +7886,46 @@ let Predicates = [HasVLX] in { } multiclass avx512_cvtps2ph { + X86MemOperand x86memop, SchedWrite RR, SchedWrite MR> { defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph", "$src2, $src1", "$src1, $src2", (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2)), 0, 0>, - AVX512AIi8Base, Sched<[WriteCvtF2F]>; + AVX512AIi8Base, Sched<[RR]>; let hasSideEffects = 0, mayStore = 1 in { def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - Sched<[WriteCvtF2FSt]>; + Sched<[MR]>; def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs), (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", []>, - EVEX_K, Sched<[WriteCvtF2FSt]>; + EVEX_K, Sched<[MR]>; } } -multiclass avx512_cvtps2ph_sae { +multiclass avx512_cvtps2ph_sae { let hasSideEffects = 0 in defm rrb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest, (outs _dest.RC:$dst), (ins _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2", []>, - EVEX_B, AVX512AIi8Base, Sched<[WriteCvtF2F]>; + EVEX_B, AVX512AIi8Base, Sched<[Sched]>; } let Predicates = [HasAVX512] in { - defm VCVTPS2PHZ : avx512_cvtps2ph, - avx512_cvtps2ph_sae, + defm VCVTPS2PHZ : avx512_cvtps2ph, + avx512_cvtps2ph_sae, EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; let Predicates = [HasVLX] in { - defm VCVTPS2PHZ256 : avx512_cvtps2ph, + defm VCVTPS2PHZ256 : avx512_cvtps2ph, EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; - defm VCVTPS2PHZ128 : avx512_cvtps2ph, + defm VCVTPS2PHZ128 : avx512_cvtps2ph, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; } diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index cc615e4438d5..c216f7ff5bc6 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -7303,37 +7303,41 @@ let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, // Half precision conversion instructions // -multiclass f16c_ph2ps { +multiclass f16c_ph2ps { def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", [(set RC:$dst, (X86cvtph2ps VR128:$src))]>, - T8PD, VEX, Sched<[WriteCvtF2F]>; + T8PD, VEX, Sched<[sched]>; let hasSideEffects = 0, mayLoad = 1 in def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", [(set RC:$dst, (X86cvtph2ps (bc_v8i16 (loadv2i64 addr:$src))))]>, - T8PD, VEX, Sched<[WriteCvtF2FLd]>; + T8PD, VEX, Sched<[sched.Folded]>; } -multiclass f16c_ps2ph { +multiclass f16c_ps2ph { def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), (ins RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>, - TAPD, VEX, Sched<[WriteCvtF2F]>; + TAPD, VEX, Sched<[RR]>; let hasSideEffects = 0, mayStore = 1 in def mr : Ii8<0x1D, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - TAPD, VEX, Sched<[WriteCvtF2FSt]>; + TAPD, VEX, Sched<[MR]>; } let Predicates = [HasF16C, NoVLX] in { - defm VCVTPH2PS : f16c_ph2ps; - defm VCVTPH2PSY : f16c_ph2ps, VEX_L; - defm VCVTPS2PH : f16c_ps2ph; - defm VCVTPS2PHY : f16c_ps2ph, VEX_L; + defm VCVTPH2PS : f16c_ph2ps; + defm VCVTPH2PSY : f16c_ph2ps, VEX_L; + defm VCVTPS2PH : f16c_ps2ph; + defm VCVTPS2PHY : f16c_ps2ph, VEX_L; // Pattern match vcvtph2ps of a scalar i64 load. def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))), diff --git a/lib/Target/X86/X86SchedBroadwell.td b/lib/Target/X86/X86SchedBroadwell.td index 3af49f624286..2307a2b51486 100755 --- a/lib/Target/X86/X86SchedBroadwell.td +++ b/lib/Target/X86/X86SchedBroadwell.td @@ -257,12 +257,6 @@ defm : BWWriteResPair; // Floating poin defm : BWWriteResPair; // Fp vector variable blends. defm : BWWriteResPair; // Fp vector variable blends. -def : WriteRes { - let Latency = 4; - let NumMicroOps = 3; - let ResourceCycles = [1,1,1]; -} - // FMA Scheduling helper class. // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } @@ -353,6 +347,16 @@ defm : BWWriteResPair; // Float -> Integer. defm : BWWriteResPair; // Integer -> Float. defm : BWWriteResPair; // Float -> Float size conversion. +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + // Strings instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -625,8 +629,7 @@ def BWWriteResGroup15 : SchedWriteRes<[BWPort0,BWPort5]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[BWWriteResGroup15], (instregex "VCVTPH2PS(Y?)rr", - "(V?)CVTPS2PDrr", +def: InstRW<[BWWriteResGroup15], (instregex "(V?)CVTPS2PDrr", "(V?)CVTSS2SDrr")>; def BWWriteResGroup16 : SchedWriteRes<[BWPort6,BWPort0156]> { @@ -825,7 +828,6 @@ def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTPI2PDirr", "MMX_CVT(T?)PS2PIirr", "(V?)CVTDQ2PDrr", "(V?)CVTPD2PSrr", - "VCVTPS2PHrr", "(V?)CVTSD2SSrr", "(V?)CVTSI642SDrr", "(V?)CVTSI2SDrr", @@ -963,8 +965,7 @@ def BWWriteResGroup59 : SchedWriteRes<[BWPort0,BWPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[BWWriteResGroup59], (instregex "VCVTPH2PS(Y?)rm", - "(V?)CVTPS2PDrm", +def: InstRW<[BWWriteResGroup59], (instregex "(V?)CVTPS2PDrm", "(V?)CVTSS2SDrm", "VPSLLVQrm", "VPSRLVQrm")>; @@ -976,7 +977,6 @@ def BWWriteResGroup60 : SchedWriteRes<[BWPort1,BWPort5]> { } def: InstRW<[BWWriteResGroup60], (instregex "VCVTDQ2PDYrr", "VCVTPD2PSYrr", - "VCVTPS2PHYrr", "VCVT(T?)PD2DQYrr")>; def BWWriteResGroup62 : SchedWriteRes<[BWPort6,BWPort23]> { diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td index 7eba2469eb94..03188a9c6727 100644 --- a/lib/Target/X86/X86SchedHaswell.td +++ b/lib/Target/X86/X86SchedHaswell.td @@ -251,11 +251,15 @@ defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; -def : WriteRes { - let Latency = 5; - let NumMicroOps = 4; - let ResourceCycles = [1,1,1,1]; -} +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; // Vector integer operations. defm : X86WriteRes; @@ -868,16 +872,14 @@ def HWWriteResGroup11 : SchedWriteRes<[HWPort0,HWPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup11], (instregex "VCVTPH2PSrm", - "(V?)CVTPS2PDrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "(V?)CVTPS2PDrm")>; def HWWriteResGroup11_1 : SchedWriteRes<[HWPort0,HWPort23]> { let Latency = 7; let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup11_1], (instregex "VCVTPH2PSYrm", - "(V?)CVTSS2SDrm", +def: InstRW<[HWWriteResGroup11_1], (instregex "(V?)CVTSS2SDrm", "VPSLLVQrm", "VPSRLVQrm")>; @@ -1076,9 +1078,7 @@ def HWWriteResGroup31 : SchedWriteRes<[HWPort0,HWPort5]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup31], (instregex "VCVTPH2PSYrr", - "VCVTPH2PSrr", - "(V?)CVTPS2PDrr", +def: InstRW<[HWWriteResGroup31], (instregex "(V?)CVTPS2PDrr", "(V?)CVTSS2SDrr")>; def HWWriteResGroup32 : SchedWriteRes<[HWPort6,HWPort0156]> { @@ -1397,7 +1397,6 @@ def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPI2PDirr", "MMX_CVT(T?)PS2PIirr", "(V?)CVTDQ2PDrr", "(V?)CVTPD2PSrr", - "VCVTPS2PHrr", "(V?)CVTSD2SSrr", "(V?)CVTSI(64)?2SDrr", "(V?)CVTSI2SSrr", @@ -1604,7 +1603,6 @@ def HWWriteResGroup102 : SchedWriteRes<[HWPort1,HWPort5]> { } def: InstRW<[HWWriteResGroup102], (instregex "VCVTDQ2PDYrr", "VCVTPD2PSYrr", - "VCVTPS2PHYrr", "VCVT(T?)PD2DQYrr")>; def HWWriteResGroup103 : SchedWriteRes<[HWPort1,HWPort23]> { @@ -1629,13 +1627,6 @@ def HWWriteResGroup105 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> { def: InstRW<[HWWriteResGroup105], (instregex "SHLD(16|32|64)rrCL", "SHRD(16|32|64)rrCL")>; -def HWWriteResGroup106 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort237]> { - let Latency = 7; - let NumMicroOps = 4; - let ResourceCycles = [1,1,1,1]; -} -def: InstRW<[HWWriteResGroup106], (instregex "VCVTPS2PHYmr")>; - def HWWriteResGroup107 : SchedWriteRes<[HWPort1,HWPort6,HWPort06,HWPort0156]> { let Latency = 6; let NumMicroOps = 4; diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td index 953d0c266fe1..5d6bcac03723 100644 --- a/lib/Target/X86/X86SchedSandyBridge.td +++ b/lib/Target/X86/X86SchedSandyBridge.td @@ -235,7 +235,14 @@ defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; -def : WriteRes { let Latency = 4; } + +defm : SBWriteResPair; +defm : SBWriteResPair; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; // Vector integer operations. defm : X86WriteRes; diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td index a986d883c0ae..ade7d0877f46 100644 --- a/lib/Target/X86/X86SchedSkylakeClient.td +++ b/lib/Target/X86/X86SchedSkylakeClient.td @@ -249,12 +249,6 @@ defm : SKLWriteResPair; // Floating po defm : SKLWriteResPair; // Fp vector variable blends. defm : SKLWriteResPair; // Fp vector variable blends. -def : WriteRes { - let Latency = 6; - let NumMicroOps = 4; - let ResourceCycles = [1,1,1,1]; -} - // FMA Scheduling helper class. // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } @@ -346,6 +340,16 @@ defm : SKLWriteResPair; // Float -> Integer. defm : SKLWriteResPair; // Integer -> Float. defm : SKLWriteResPair; // Float -> Float size conversion. +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + // Strings instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -930,9 +934,7 @@ def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVT(T?)PD2PIirr", "MMX_CVT(T?)PS2PIirr", "(V?)CVT(T?)PD2DQrr", "(V?)CVTPD2PSrr", - "VCVTPH2PSrr", "(V?)CVTPS2PDrr", - "VCVTPS2PHrr", "(V?)CVTSD2SSrr", "(V?)CVTSI642SDrr", "(V?)CVTSI2SDrr", @@ -1157,9 +1159,7 @@ def SKLWriteResGroup89 : SchedWriteRes<[SKLPort5,SKLPort01]> { let ResourceCycles = [1,1]; } def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPD2PSYrr", - "VCVTPH2PSYrr", "VCVTPS2PDYrr", - "VCVTPS2PHYrr", "VCVT(T?)PD2DQYrr")>; def SKLWriteResGroup91 : SchedWriteRes<[SKLPort23,SKLPort015]> { @@ -1300,13 +1300,6 @@ def SKLWriteResGroup112 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { } def: InstRW<[SKLWriteResGroup112], (instregex "MMX_PH(ADD|SUB)SWrm")>; -def SKLWriteResGroup114 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237,SKLPort01]> { - let Latency = 8; - let NumMicroOps = 4; - let ResourceCycles = [1,1,1,1]; -} -def: InstRW<[SKLWriteResGroup114], (instregex "VCVTPS2PHYmr")>; - def SKLWriteResGroup115 : SchedWriteRes<[SKLPort23,SKLPort237,SKLPort06]> { let Latency = 8; let NumMicroOps = 5; @@ -1369,7 +1362,6 @@ def SKLWriteResGroup123 : SchedWriteRes<[SKLPort23,SKLPort01]> { let ResourceCycles = [1,1]; } def: InstRW<[SKLWriteResGroup123], (instregex "MMX_CVT(T?)PS2PIirm", - "VCVTPH2PSrm", "(V?)CVTPS2PDrm")>; def SKLWriteResGroup127 : SchedWriteRes<[SKLPort1,SKLPort5,SKLPort23]> { @@ -1418,7 +1410,6 @@ def SKLWriteResGroup134 : SchedWriteRes<[SKLPort01,SKLPort23]> { let ResourceCycles = [1,1]; } def: InstRW<[SKLWriteResGroup134], (instregex "(V?)CVTDQ2PSrm", - "(V?)CVTPH2PSYrm", "(V?)CVTPS2DQrm", "(V?)CVTSS2SDrm", "(V?)CVTTPS2DQrm")>; diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td index 01bbb035d46d..fb9f0bb87b66 100755 --- a/lib/Target/X86/X86SchedSkylakeServer.td +++ b/lib/Target/X86/X86SchedSkylakeServer.td @@ -249,12 +249,6 @@ defm : SKXWriteResPair; // Floating poi defm : SKXWriteResPair; // Fp vector variable blends. defm : SKXWriteResPair; // Fp vector variable blends. -def : WriteRes { - let Latency = 6; - let NumMicroOps = 4; - let ResourceCycles = [1,1,1,1]; -} - // FMA Scheduling helper class. // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } @@ -346,6 +340,16 @@ defm : SKXWriteResPair; // Float -> Integer. defm : SKXWriteResPair; // Integer -> Float. defm : SKXWriteResPair; // Float -> Float size conversion. +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + // Strings instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -1050,12 +1054,8 @@ def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVT(T?)PD2PIirr", "VCVTPD2PSZ128rr", "(V?)CVTPD2PSrr", "VCVTPD2UDQZ128rr", - "VCVTPH2PSZ128rr", - "VCVTPH2PSrr", "VCVTPS2PDZ128rr", "(V?)CVTPS2PDrr", - "VCVTPS2PHZ128rr", - "VCVTPS2PHrr", "VCVTPS2QQZ128rr", "VCVTPS2UQQZ128rr", "VCVTQQ2PSZ128rr", @@ -1370,9 +1370,7 @@ def: InstRW<[SKXWriteResGroup93], (instregex "VCVTDQ2PD(Z|Z256)rr", "VCVTPD2DQ(Y|Z|Z256)rr", "VCVTPD2PS(Y|Z|Z256)rr", "VCVTPD2UDQ(Z|Z256)rr", - "VCVTPH2PS(Y|Z|Z256)rr", "VCVTPS2PD(Y|Z|Z256)rr", - "VCVTPS2PH(Y|Z|Z256)rr", "VCVTPS2QQ(Z|Z256)rr", "VCVTPS2UQQ(Z|Z256)rr", "VCVTQQ2PS(Z|Z256)rr", @@ -1668,13 +1666,6 @@ def SKXWriteResGroup123 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { } def: InstRW<[SKXWriteResGroup123], (instregex "MMX_PH(ADD|SUB)SWrm")>; -def SKXWriteResGroup125 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237,SKXPort015]> { - let Latency = 8; - let NumMicroOps = 4; - let ResourceCycles = [1,1,1,1]; -} -def: InstRW<[SKXWriteResGroup125], (instregex "VCVTPS2PHYmr")>; - def SKXWriteResGroup126 : SchedWriteRes<[SKXPort23,SKXPort237,SKXPort06]> { let Latency = 8; let NumMicroOps = 5; @@ -1816,7 +1807,6 @@ def SKXWriteResGroup137 : SchedWriteRes<[SKXPort23,SKXPort015]> { let ResourceCycles = [1,1]; } def: InstRW<[SKXWriteResGroup137], (instregex "MMX_CVT(T?)PS2PIirm", - "VCVTPH2PSrm", "(V?)CVTPS2PDrm")>; def SKXWriteResGroup138 : SchedWriteRes<[SKXPort0,SKXPort015]> { @@ -1905,7 +1895,6 @@ def: InstRW<[SKXWriteResGroup149], (instregex "VCVTDQ2PDZ128rm(b?)", "(V?)CVTDQ2PSrm", "VCVTPD2QQZ128rm(b?)", "VCVTPD2UQQZ128rm(b?)", - "VCVTPH2PSYrm", "VCVTPH2PSZ128rm(b?)", "VCVTPS2DQZ128rm(b?)", "(V?)CVTPS2DQrm", diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index 6bbf456e5880..4c004f02630b 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -299,7 +299,14 @@ def WriteMMXMOVMSK : SchedWrite; defm WriteCvtF2I : X86SchedWritePair; // Float -> Integer. defm WriteCvtI2F : X86SchedWritePair; // Integer -> Float. defm WriteCvtF2F : X86SchedWritePair; // Float -> Float size conversion. -def WriteCvtF2FSt : SchedWrite; // // Float -> Float + store size conversion. + +defm WriteCvtPH2PS : X86SchedWritePair; // Half -> Float size conversion. +defm WriteCvtPH2PSY : X86SchedWritePair; // Half -> Float size conversion (YMM/ZMM). + +def WriteCvtPS2PH : SchedWrite; // // Float -> Half size conversion. +def WriteCvtPS2PHY : SchedWrite; // // Float -> Half size conversion (YMM/ZMM). +def WriteCvtPS2PHSt : SchedWrite; // // Float -> Half + store size conversion. +def WriteCvtPS2PHYSt : SchedWrite; // // Float -> Half + store size conversion (YMM/ZMM). // CRC32 instruction. defm WriteCRC32 : X86SchedWritePair; diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index 32e84a169ea7..dd09ef2e7c8e 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -276,7 +276,13 @@ defm : AtomWriteResPair; // NOTE defm : AtomWriteResPair; // Float -> Integer. defm : AtomWriteResPair; // Integer -> Float. defm : AtomWriteResPair; // Float -> Float size conversion. -def : WriteRes; // NOTE: Doesn't exist on Atom. + +defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. +defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. +def : WriteRes; // NOTE: Doesn't exist on Atom. +def : WriteRes; // NOTE: Doesn't exist on Atom. +def : WriteRes; // NOTE: Doesn't exist on Atom. +def : WriteRes; // NOTE: Doesn't exist on Atom. //////////////////////////////////////////////////////////////////////////////// // Vector integer operations. diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td index aa0803ff2f06..66aef173fcea 100644 --- a/lib/Target/X86/X86ScheduleBtVer2.td +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -363,7 +363,14 @@ defm : JWriteResFpuPair; // NOTE: Doesn defm : JWriteResFpuPair; // Float -> Integer. defm : JWriteResFpuPair; // Integer -> Float. defm : JWriteResFpuPair; // Float -> Float size conversion. -def : WriteRes { let Latency = 4; } + +defm : JWriteResFpuPair; +defm : JWriteResYMMPair; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; def JWriteCVTF2F : SchedWriteRes<[JFPU1, JSTC]> { let Latency = 7; @@ -528,38 +535,6 @@ def JWriteINSERTQ: SchedWriteRes<[JFPU01, JVALU]> { } def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>; -//////////////////////////////////////////////////////////////////////////////// -// F16C instructions. -//////////////////////////////////////////////////////////////////////////////// - -def JWriteCVTPS2PHY: SchedWriteRes<[JFPU1, JSTC, JFPX]> { - let Latency = 6; - let ResourceCycles = [2, 2, 2]; - let NumMicroOps = 3; -} -def : InstRW<[JWriteCVTPS2PHY], (instrs VCVTPS2PHYrr)>; - -def JWriteCVTPS2PHYSt: SchedWriteRes<[JFPU1, JSTC, JFPX, JSAGU]> { - let Latency = 7; - let ResourceCycles = [2, 2, 2, 1]; - let NumMicroOps = 3; -} -def : InstRW<[JWriteCVTPS2PHYSt], (instrs VCVTPS2PHYmr)>; - -def JWriteCVTPH2PSY: SchedWriteRes<[JFPU1, JSTC]> { - let Latency = 3; - let ResourceCycles = [2, 2]; - let NumMicroOps = 2; -} -def : InstRW<[JWriteCVTPH2PSY], (instrs VCVTPH2PSYrr)>; - -def JWriteCVTPH2PSYLd: SchedWriteRes<[JLAGU, JFPU1, JSTC]> { - let Latency = 8; - let ResourceCycles = [1, 2, 2]; - let NumMicroOps = 2; -} -def : InstRW<[JWriteCVTPH2PSYLd], (instrs VCVTPH2PSYrm)>; - //////////////////////////////////////////////////////////////////////////////// // AVX instructions. //////////////////////////////////////////////////////////////////////////////// diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td index cec2768b7929..3388412575d5 100644 --- a/lib/Target/X86/X86ScheduleSLM.td +++ b/lib/Target/X86/X86ScheduleSLM.td @@ -212,7 +212,6 @@ defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; -def : WriteRes; // Vector integer operations. def : WriteRes { let Latency = 3; } @@ -397,4 +396,11 @@ defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; +defm : SLMWriteResPair; +defm : SLMWriteResPair; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; + } // SchedModel diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td index ecf5ea62b444..d8584a91c7a9 100644 --- a/lib/Target/X86/X86ScheduleZnver1.td +++ b/lib/Target/X86/X86ScheduleZnver1.td @@ -272,7 +272,6 @@ defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; -def : WriteRes; // Vector integer operations which uses FPU units defm : X86WriteRes; @@ -1326,18 +1325,21 @@ def : InstRW<[ZnWriteCVSTSI2SIr], (instregex "(V?)CVT(T?)SD2SI(64)?rr")>; // r32,m32. def : InstRW<[ZnWriteCVSTSI2SILd], (instregex "(V?)CVT(T?)SD2SI(64)?rm")>; - // VCVTPS2PH. // x,v,i. -def : InstRW<[WriteMicrocoded], (instregex "VCVTPS2PH(Y?)rr")>; +def : SchedAlias; +def : SchedAlias; // m,v,i. -def : InstRW<[WriteMicrocoded], (instregex "VCVTPS2PH(Y?)mr")>; +def : SchedAlias; +def : SchedAlias; // VCVTPH2PS. // v,x. -def : InstRW<[WriteMicrocoded], (instregex "VCVTPH2PS(Y?)rr")>; +def : SchedAlias; +def : SchedAlias; // v,m. -def : InstRW<[WriteMicrocoded], (instregex "VCVTPH2PS(Y?)rm")>; +def : SchedAlias; +def : SchedAlias; //-- SSE4A instructions --// // EXTRQ diff --git a/test/CodeGen/X86/f16c-schedule.ll b/test/CodeGen/X86/f16c-schedule.ll index 0c0f9d4b403b..973f6453e7e9 100644 --- a/test/CodeGen/X86/f16c-schedule.ll +++ b/test/CodeGen/X86/f16c-schedule.ll @@ -143,7 +143,7 @@ define <8 x i16> @test_vcvtps2ph_128(<4 x float> %a0, <4 x float> %a1, <4 x i16> ; BROADWELL-LABEL: test_vcvtps2ph_128: ; BROADWELL: # %bb.0: ; BROADWELL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [4:1.00] -; BROADWELL-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [4:1.00] +; BROADWELL-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [5:1.00] ; BROADWELL-NEXT: retq # sched: [7:1.00] ; ; SKYLAKE-LABEL: test_vcvtps2ph_128: @@ -196,7 +196,7 @@ define <8 x i16> @test_vcvtps2ph_256(<8 x float> %a0, <8 x float> %a1, <8 x i16> ; BROADWELL-LABEL: test_vcvtps2ph_256: ; BROADWELL: # %bb.0: ; BROADWELL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [6:1.00] -; BROADWELL-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [4:1.00] +; BROADWELL-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [7:1.00] ; BROADWELL-NEXT: vzeroupper # sched: [4:1.00] ; BROADWELL-NEXT: retq # sched: [7:1.00] ; diff --git a/test/tools/llvm-mca/X86/Broadwell/resources-f16c.s b/test/tools/llvm-mca/X86/Broadwell/resources-f16c.s index 810bfcecda72..d7490ae83b15 100644 --- a/test/tools/llvm-mca/X86/Broadwell/resources-f16c.s +++ b/test/tools/llvm-mca/X86/Broadwell/resources-f16c.s @@ -27,9 +27,9 @@ vcvtps2ph $0, %ymm0, (%rax) # CHECK-NEXT: 2 2 1.00 vcvtph2ps %xmm0, %ymm2 # CHECK-NEXT: 2 6 1.00 * vcvtph2ps (%rax), %ymm2 # CHECK-NEXT: 2 4 1.00 vcvtps2ph $0, %xmm0, %xmm2 -# CHECK-NEXT: 3 4 1.00 * vcvtps2ph $0, %xmm0, (%rax) +# CHECK-NEXT: 3 5 1.00 * vcvtps2ph $0, %xmm0, (%rax) # CHECK-NEXT: 2 6 1.00 vcvtps2ph $0, %ymm0, %xmm2 -# CHECK-NEXT: 3 4 1.00 * vcvtps2ph $0, %ymm0, (%rax) +# CHECK-NEXT: 3 7 1.00 * vcvtps2ph $0, %ymm0, (%rax) # CHECK: Resources: # CHECK-NEXT: [0] - BWDivider diff --git a/test/tools/llvm-mca/X86/BtVer2/resources-f16c.s b/test/tools/llvm-mca/X86/BtVer2/resources-f16c.s index d1c07b3cacc2..d407cc478c66 100644 --- a/test/tools/llvm-mca/X86/BtVer2/resources-f16c.s +++ b/test/tools/llvm-mca/X86/BtVer2/resources-f16c.s @@ -49,14 +49,14 @@ vcvtps2ph $0, %ymm0, (%rax) # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: - - - 2.00 2.00 - 12.00 2.00 - 2.00 12.00 - - - +# CHECK-NEXT: - - - 2.00 2.00 - 12.00 3.00 - 2.00 12.00 - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: # CHECK-NEXT: - - - - - - 1.00 - - - 1.00 - - - vcvtph2ps %xmm0, %xmm2 # CHECK-NEXT: - - - - - - 1.00 1.00 - - 1.00 - - - vcvtph2ps (%rax), %xmm2 # CHECK-NEXT: - - - - - - 2.00 - - - 2.00 - - - vcvtph2ps %xmm0, %ymm2 -# CHECK-NEXT: - - - - - - 2.00 1.00 - - 2.00 - - - vcvtph2ps (%rax), %ymm2 +# CHECK-NEXT: - - - - - - 2.00 2.00 - - 2.00 - - - vcvtph2ps (%rax), %ymm2 # CHECK-NEXT: - - - - - - 1.00 - - - 1.00 - - - vcvtps2ph $0, %xmm0, %xmm2 # CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vcvtps2ph $0, %xmm0, (%rax) # CHECK-NEXT: - - - 1.00 1.00 - 2.00 - - - 2.00 - - - vcvtps2ph $0, %ymm0, %xmm2