Skip to content

Commit

Permalink
[X86] Fix non-intrinsic roundss/roundsd to not read the destination r…
Browse files Browse the repository at this point in the history
…egister

This changes the scalar non-intrinsic non-avx roundss/sd instruction
definitions not to read their destination register - allowing partial dependency
breaking.

This fixes PR31143.

Differential Revision: https://reviews.llvm.org/D27323


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@288703 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
mkuperst committed Dec 5, 2016
1 parent 923a400 commit 8bb9ea4
Show file tree
Hide file tree
Showing 3 changed files with 156 additions and 82 deletions.
4 changes: 2 additions & 2 deletions lib/Target/X86/X86InstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE },
{ X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
{ X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },
{ X86::ROUNDSDr, X86::ROUNDSDm, 0 },
{ X86::ROUNDSSr, X86::ROUNDSSm, 0 },
{ X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
{ X86::RSQRTSSr, X86::RSQRTSSm, 0 },
{ X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE },
Expand Down Expand Up @@ -1212,8 +1214,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
{ X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
{ X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
{ X86::ROUNDSDr, X86::ROUNDSDm, 0 },
{ X86::ROUNDSSr, X86::ROUNDSSm, 0 },
{ X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE },
{ X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE },
{ X86::SBB32rr, X86::SBB32rm, 0 },
Expand Down
174 changes: 94 additions & 80 deletions lib/Target/X86/X86InstrSSE.td
Original file line number Diff line number Diff line change
Expand Up @@ -6268,10 +6268,10 @@ let Predicates = [UseAVX] in {
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//

multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
X86MemOperand x86memop, RegisterClass RC,
PatFrag mem_frag32, PatFrag mem_frag64,
Intrinsic V4F32Int, Intrinsic V2F64Int> {
multiclass sse41_fp_unop_p<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
X86MemOperand x86memop, RegisterClass RC,
PatFrag mem_frag32, PatFrag mem_frag64,
Intrinsic V4F32Int, Intrinsic V2F64Int> {
let ExeDomain = SSEPackedSingle in {
// Intrinsic operation, reg.
// Vector intrinsic operation, reg
Expand Down Expand Up @@ -6312,35 +6312,73 @@ let ExeDomain = SSEPackedDouble in {
} // ExeDomain = SSEPackedDouble
}

multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
string OpcodeStr,
Intrinsic F32Int,
Intrinsic F64Int, bit Is2Addr = 1> {
let ExeDomain = GenericDomain in {
// Operation, reg.
let hasSideEffects = 0 in
multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
string OpcodeStr> {
let ExeDomain = GenericDomain, hasSideEffects = 0 in {
def SSr : SS4AIi8<opcss, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
(outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, Sched<[WriteFAdd]>;

// Operation, mem.
let mayLoad = 1, hasSideEffects = 0 in
let mayLoad = 1 in
def SSm : SS4AIi8<opcss, MRMSrcMem,
(outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, Sched<[WriteFAddLd, ReadAfterLd]>;

// Intrinsic operation, reg.
let isCodeGenOnly = 1 in
def SDr : SS4AIi8<opcsd, MRMSrcReg,
(outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, Sched<[WriteFAdd]>;

let mayLoad = 1 in
def SDm : SS4AIi8<opcsd, MRMSrcMem,
(outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, Sched<[WriteFAddLd, ReadAfterLd]>;
} // ExeDomain = GenericDomain, hasSideEffects = 0
}

multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
string OpcodeStr> {
let ExeDomain = GenericDomain, hasSideEffects = 0 in {
def SSr : SS4AIi8<opcss, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
"ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, Sched<[WriteFAdd]>;

let mayLoad = 1 in
def SSm : SS4AIi8<opcss, MRMSrcMem,
(outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
"ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, Sched<[WriteFAddLd, ReadAfterLd]>;

def SDr : SS4AIi8<opcsd, MRMSrcReg,
(outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
"sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, Sched<[WriteFAdd]>;

let mayLoad = 1 in
def SDm : SS4AIi8<opcsd, MRMSrcMem,
(outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
"sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, Sched<[WriteFAddLd, ReadAfterLd]>;
} // ExeDomain = GenericDomain, hasSideEffects = 0
}

multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
string OpcodeStr,
Intrinsic F32Int,
Intrinsic F64Int, bit Is2Addr = 1> {
let ExeDomain = GenericDomain, isCodeGenOnly = 1 in {
def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
Expand All @@ -6351,8 +6389,6 @@ let ExeDomain = GenericDomain in {
[(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
Sched<[WriteFAdd]>;

// Intrinsic operation, mem.
let isCodeGenOnly = 1 in
def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
!if(Is2Addr,
Expand All @@ -6364,30 +6400,6 @@ let ExeDomain = GenericDomain in {
(F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
Sched<[WriteFAddLd, ReadAfterLd]>;

// Operation, reg.
let hasSideEffects = 0 in
def SDr : SS4AIi8<opcsd, MRMSrcReg,
(outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[]>, Sched<[WriteFAdd]>;

// Operation, mem.
let mayLoad = 1, hasSideEffects = 0 in
def SDm : SS4AIi8<opcsd, MRMSrcMem,
(outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[]>, Sched<[WriteFAddLd, ReadAfterLd]>;

// Intrinsic operation, reg.
let isCodeGenOnly = 1 in
def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
Expand All @@ -6398,8 +6410,6 @@ let ExeDomain = GenericDomain in {
[(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
Sched<[WriteFAdd]>;

// Intrinsic operation, mem.
let isCodeGenOnly = 1 in
def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
!if(Is2Addr,
Expand All @@ -6410,23 +6420,24 @@ let ExeDomain = GenericDomain in {
[(set VR128:$dst,
(F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
Sched<[WriteFAddLd, ReadAfterLd]>;
} // ExeDomain = GenericDomain
} // ExeDomain = GenericDomain, isCodeGenOnly = 1
}

// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX] in {
// Intrinsic form
defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
loadv4f32, loadv2f64,
int_x86_sse41_round_ps,
int_x86_sse41_round_pd>, VEX;
defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
loadv8f32, loadv4f64,
int_x86_avx_round_ps_256,
int_x86_avx_round_pd_256>, VEX, VEX_L;
defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
int_x86_sse41_round_ss,
int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128,
loadv4f32, loadv2f64,
int_x86_sse41_round_ps,
int_x86_sse41_round_pd>, VEX;
defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256,
loadv8f32, loadv4f64,
int_x86_avx_round_ps_256,
int_x86_avx_round_pd_256>, VEX, VEX_L;
defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround",
int_x86_sse41_round_ss,
int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
}

let Predicates = [UseAVX] in {
Expand Down Expand Up @@ -6498,34 +6509,37 @@ let Predicates = [HasAVX] in {
(VROUNDYPDr VR256:$src, (i32 0xB))>;
}

defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
memopv4f32, memopv2f64,
int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
defm ROUND : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128,
memopv4f32, memopv2f64, int_x86_sse41_round_ps,
int_x86_sse41_round_pd>;

defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round">;

let Constraints = "$src1 = $dst" in
defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round",
defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round",
int_x86_sse41_round_ss, int_x86_sse41_round_sd>;

let Predicates = [UseSSE41] in {
def : Pat<(ffloor FR32:$src),
(ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
(ROUNDSSr FR32:$src, (i32 0x9))>;
def : Pat<(f64 (ffloor FR64:$src)),
(ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
(ROUNDSDr FR64:$src, (i32 0x9))>;
def : Pat<(f32 (fnearbyint FR32:$src)),
(ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
(ROUNDSSr FR32:$src, (i32 0xC))>;
def : Pat<(f64 (fnearbyint FR64:$src)),
(ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
(ROUNDSDr FR64:$src, (i32 0xC))>;
def : Pat<(f32 (fceil FR32:$src)),
(ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
(ROUNDSSr FR32:$src, (i32 0xA))>;
def : Pat<(f64 (fceil FR64:$src)),
(ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
(ROUNDSDr FR64:$src, (i32 0xA))>;
def : Pat<(f32 (frint FR32:$src)),
(ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
(ROUNDSSr FR32:$src, (i32 0x4))>;
def : Pat<(f64 (frint FR64:$src)),
(ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
(ROUNDSDr FR64:$src, (i32 0x4))>;
def : Pat<(f32 (ftrunc FR32:$src)),
(ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
(ROUNDSSr FR32:$src, (i32 0xB))>;
def : Pat<(f64 (ftrunc FR64:$src)),
(ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
(ROUNDSDr FR64:$src, (i32 0xB))>;

def : Pat<(v4f32 (ffloor VR128:$src)),
(ROUNDPSr VR128:$src, (i32 0x9))>;
Expand Down
60 changes: 60 additions & 0 deletions test/CodeGen/X86/pr31143.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
; RUN: llc -mtriple=x86_64-pc-linux-gnu -mattr=+sse4.2 < %s | FileCheck %s

; CHECK-LABEL: testss:
; CHECK: movss {{.*}}, %[[XMM0:xmm[0-9]+]]
; CHECK: xorps %[[XMM1:xmm[0-9]+]], %[[XMM1]]
; CHECK: roundss $9, %[[XMM0]], %[[XMM1]]

define void @testss(float* nocapture %a, <4 x float>* nocapture %b, i32 %k) {
entry:
br label %for.body

for.body:
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
%v = load float, float* %arrayidx, align 4
%floor = call float @floorf(float %v)
%sub = fsub float %floor, %v
%v1 = insertelement <4 x float> undef, float %sub, i32 0
%br = shufflevector <4 x float> %v1, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
store volatile <4 x float> %br, <4 x float>* %b, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %k
br i1 %exitcond, label %for.end, label %for.body

for.end:
ret void
}

; CHECK-LABEL: testsd:
; CHECK: movsd {{.*}}, %[[XMM0:xmm[0-9]+]]
; CHECK: xorps %[[XMM1:xmm[0-9]+]], %[[XMM1]]
; CHECK: roundsd $9, %[[XMM0]], %[[XMM1]]

define void @testsd(double* nocapture %a, <2 x double>* nocapture %b, i32 %k) {
entry:
br label %for.body

for.body:
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv
%v = load double, double* %arrayidx, align 4
%floor = call double @floor(double %v)
%sub = fsub double %floor, %v
%v1 = insertelement <2 x double> undef, double %sub, i32 0
%br = shufflevector <2 x double> %v1, <2 x double> undef, <2 x i32> <i32 0, i32 0>
store volatile <2 x double> %br, <2 x double>* %b, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %k
br i1 %exitcond, label %for.end, label %for.body

for.end:
ret void
}

declare float @floorf(float) nounwind readnone

declare double @floor(double) nounwind readnone

0 comments on commit 8bb9ea4

Please sign in to comment.