[X86][DAG] Switch X86 Target to post-legalized store merge
Move store merge to happen after intrinsic lowering to allow lowered
stores to be merged.

Some regressions in MergeConsecutiveStores are due to missing insert_subvector
handling; these are addressed in a follow-up patch.

Reviewers: craig.topper, efriedma, RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D34559

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@310710 91177308-0d34-0410-b5e6-96231b3b80d8
niravhdave committed Aug 11, 2017
1 parent 0fdbc97 commit b872fbb
Showing 16 changed files with 175 additions and 235 deletions.
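
For context before the per-file diffs: store merging is done by the DAG combiner, and the new mergeStoresAfterLegalization() hook controls whether it may also run after the DAG has been legalized. Below is a minimal sketch of that gating in C++; tryMergeStores is a hypothetical wrapper for illustration, not the in-tree DAGCombiner entry point, which is more involved:

  // Sketch only. TLI is the target's TargetLowering; LegalTypes is true once
  // type legalization has run. MergeConsecutiveStores is the existing
  // combiner routine that folds adjacent stores into wider ones.
  bool tryMergeStores(DAGCombiner &DC, StoreSDNode *St,
                      const TargetLowering &TLI, bool LegalTypes) {
    // Merging always ran before legalization. With this patch, a target that
    // opts in (as X86 now does) also gets merging on the post-legalized DAG,
    // where stores produced by intrinsic lowering are finally visible.
    if (LegalTypes && !TLI.mergeStoresAfterLegalization())
      return false;
    return DC.MergeConsecutiveStores(St);
  }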
3 changes: 3 additions & 0 deletions include/llvm/Target/TargetLowering.h
@@ -2723,6 +2723,9 @@ class TargetLowering : public TargetLoweringBase {
                              bool foldBooleans, DAGCombinerInfo &DCI,
                              const SDLoc &dl) const;
 
+  // For targets which wrap address, unwrap for analysis.
+  virtual SDValue unwrapAddress(SDValue N) const { return N; }
+
   /// Returns true (and the GlobalValue and the offset) if the node is a
   /// GlobalAddress + offset.
   virtual bool
3 changes: 2 additions & 1 deletion lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -14,6 +14,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Target/TargetLowering.h"
 
 namespace llvm {
 
@@ -55,7 +56,7 @@ bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other,
 /// Parses tree in Ptr for base, index, offset addresses.
 BaseIndexOffset BaseIndexOffset::match(SDValue Ptr, const SelectionDAG &DAG) {
   // (((B + I*M) + c)) + c ...
-  SDValue Base = Ptr;
+  SDValue Base = DAG.getTargetLoweringInfo().unwrapAddress(Ptr);
   SDValue Index = SDValue();
   int64_t Offset = 0;
   bool IsIndexSignExt = false;
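
The unwrap matters because X86 emits global and constant-pool addresses behind X86ISD::Wrapper / X86ISD::WrapperRIP nodes, so two stores into the same object could present different-looking bases to this analysis and never be recognized as adjacent. A simplified sketch of how store merging might consult the analysis; St0 and St1 are assumed StoreSDNode pointers, and equalBaseIndex is shown with an abbreviated signature:

  // Decide whether St1 stores immediately after St0 (sketch).
  BaseIndexOffset B0 = BaseIndexOffset::match(St0->getBasePtr(), DAG);
  BaseIndexOffset B1 = BaseIndexOffset::match(St1->getBasePtr(), DAG);
  int64_t Off = 0; // set to B1's offset relative to B0 when bases match
  if (B0.equalBaseIndex(B1, Off) &&
      Off == (int64_t)St0->getMemoryVT().getStoreSize()) {
    // Same base, contiguous offsets: St0 and St1 are merge candidates.
    // Before unwrapAddress(), the wrapper node hid the common
    // TargetGlobalAddress base from this comparison.
  }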
6 changes: 6 additions & 0 deletions lib/Target/X86/X86ISelLowering.cpp
@@ -27034,6 +27034,12 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
   return 1;
 }
 
+SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
+  if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
+    return N->getOperand(0);
+  return N;
+}
+
 /// Returns true (and the GlobalValue and the offset) if the node is a
 /// GlobalAddress + offset.
 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
4 changes: 4 additions & 0 deletions lib/Target/X86/X86ISelLowering.h
@@ -812,6 +812,8 @@ namespace llvm {
     /// This method returns the name of a target specific DAG node.
     const char *getTargetNodeName(unsigned Opcode) const override;
 
+    bool mergeStoresAfterLegalization() const override { return true; }
+
     bool isCheapToSpeculateCttz() const override;
 
     bool isCheapToSpeculateCtlz() const override;
@@ -867,6 +869,8 @@ namespace llvm {
                                            const SelectionDAG &DAG,
                                            unsigned Depth) const override;
 
+    SDValue unwrapAddress(SDValue N) const override;
+
     bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
                         int64_t &Offset) const override;
 
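
Together, these two overrides are the whole opt-in surface: one allows merging on the post-legalized DAG, the other lets BaseIndexOffset see through the target's address-wrapper node. A hypothetical sketch of another backend opting in the same way; MyTargetLowering and MYTGT::Wrapper are invented names, not part of this patch:

  class MyTargetLowering : public TargetLowering {
  public:
    // Let MergeConsecutiveStores run on the post-legalized DAG as well.
    bool mergeStoresAfterLegalization() const override { return true; }

    // Strip this target's address-wrapper node so BaseIndexOffset::match
    // can parse the underlying base-plus-offset expression.
    SDValue unwrapAddress(SDValue N) const override {
      if (N->getOpcode() == MYTGT::Wrapper)
        return N->getOperand(0);
      return N;
    }
  };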
39 changes: 24 additions & 15 deletions test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -492,10 +492,15 @@ define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
   store float %vecext7, float* %arrayidx7, align 4
   ret void
 
 ; CHECK-LABEL: merge_vec_element_store
-; CHECK: vmovups
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK: vextractf128 $1, %ymm0, %xmm1
+; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK: retq
 
+; This is what should be generated:
+; FIXME-LABEL: merge_vec_element_store
+; FIXME: vmovups
+; FIXME-NEXT: vzeroupper
+; FIXME-NEXT: retq
 }
 
 ; PR21711 - Merge vector stores into wider vector stores.
@@ -515,11 +520,18 @@ define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x flo
   store <4 x float> %shuffle3, <4 x float>* %idx3, align 16
   ret void
 
 ; CHECK-LABEL: merge_vec_extract_stores
-; CHECK: vmovups %ymm0, 48(%rdi)
-; CHECK-NEXT: vmovups %ymm1, 80(%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; These vblendpd are obviously redundant.
+; CHECK: vblendpd $12, %ymm0, %ymm0, %ymm0 # ymm0 = ymm0[0,1,2,3]
+; CHECK: vmovupd %ymm0, 48(%rdi)
+; CHECK: vblendpd $12, %ymm1, %ymm1, %ymm0 # ymm0 = ymm1[0,1,2,3]
+; CHECK: vmovupd %ymm0, 80(%rdi)
 
+; This is what should be generated:
+; FIXME-LABEL: merge_vec_extract_stores
+; FIXME: vmovups %ymm0, 48(%rdi)
+; FIXME-NEXT: vmovups %ymm1, 80(%rdi)
+; FIXME-NEXT: vzeroupper
+; FIXME-NEXT: retq
 }
 
 ; Merging vector stores when sourced from vector loads.
@@ -557,8 +569,7 @@ define void @merge_vec_stores_of_constants(<4 x i32>* %ptr) {
 }
 
 ; This is a minimized test based on real code that was failing.
-; We could merge stores (and loads) like this...
-
+; This should now be merged.
 define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
   %idx0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0
   %idx1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1
@@ -575,10 +586,8 @@ define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
   ret void
 
 ; CHECK-LABEL: merge_vec_element_and_scalar_load
-; CHECK: movq (%rdi), %rax
-; CHECK-NEXT: movq 8(%rdi), %rcx
-; CHECK-NEXT: movq %rax, 32(%rdi)
-; CHECK-NEXT: movq %rcx, 40(%rdi)
+; CHECK: vmovups (%rdi), %xmm0
+; CHECK-NEXT: vmovups %xmm0, 32(%rdi)
 ; CHECK-NEXT: retq
 }
 
7 changes: 3 additions & 4 deletions test/CodeGen/X86/bigstructret.ll
@@ -31,22 +31,21 @@ entry:
   ret %0 %3
 }
 
+
 define fastcc %1 @ReturnBigStruct2() nounwind readnone {
 ; X86-LABEL: ReturnBigStruct2:
 ; X86: # BB#0: # %entry
 ; X86-NEXT: movl $48, 4(%ecx)
 ; X86-NEXT: movb $1, 2(%ecx)
-; X86-NEXT: movb $1, 1(%ecx)
-; X86-NEXT: movb $0, (%ecx)
+; X86-NEXT: movw $256, (%ecx) # imm = 0x100
 ; X86-NEXT: movl %ecx, %eax
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: ReturnBigStruct2:
 ; X64: # BB#0: # %entry
 ; X64-NEXT: movl $48, 4(%rdi)
 ; X64-NEXT: movb $1, 2(%rdi)
-; X64-NEXT: movb $1, 1(%rdi)
-; X64-NEXT: movb $0, (%rdi)
+; X64-NEXT: movw $256, (%rdi) # imm = 0x100
 ; X64-NEXT: movq %rdi, %rax
 ; X64-NEXT: retq
 entry:
7 changes: 2 additions & 5 deletions test/CodeGen/X86/bitcast-i256.ll
@@ -12,11 +12,8 @@ define i256 @foo(<8 x i32> %a) {
 ;
 ; SLOW-LABEL: foo:
 ; SLOW: # BB#0:
-; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; SLOW-NEXT: vpextrq $1, %xmm1, 24(%rdi)
-; SLOW-NEXT: vmovq %xmm1, 16(%rdi)
-; SLOW-NEXT: vpextrq $1, %xmm0, 8(%rdi)
-; SLOW-NEXT: vmovq %xmm0, (%rdi)
+; SLOW-NEXT: vextractf128 $1, %ymm0, 16(%rdi)
+; SLOW-NEXT: vmovups %xmm0, (%rdi)
 ; SLOW-NEXT: movq %rdi, %rax
 ; SLOW-NEXT: vzeroupper
 ; SLOW-NEXT: retq
3 changes: 1 addition & 2 deletions test/CodeGen/X86/constant-combines.ll
@@ -15,12 +15,11 @@ define void @PR22524({ float, float }* %arg) {
 ;
 ; CHECK-LABEL: PR22524:
 ; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movl $0, 4(%rdi)
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: xorps %xmm1, %xmm1
 ; CHECK-NEXT: mulss %xmm0, %xmm1
-; CHECK-NEXT: movl $0, (%rdi)
+; CHECK-NEXT: movq $0, (%rdi)
 ; CHECK-NEXT: movss %xmm1, 4(%rdi)
 ; CHECK-NEXT: retq
 entry:
78 changes: 46 additions & 32 deletions test/CodeGen/X86/extract-store.ll
@@ -510,22 +510,22 @@ define void @extract_f64_1(double* nocapture %dst, <2 x double> %foo) nounwind {
 }
 
 define void @extract_f128_0(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
-; X32-LABEL: extract_f128_0:
-; X32: # BB#0:
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %esi, 12(%edi)
-; X32-NEXT: movl %edx, 8(%edi)
-; X32-NEXT: movl %ecx, 4(%edi)
-; X32-NEXT: movl %eax, (%edi)
-; X32-NEXT: popl %esi
-; X32-NEXT: popl %edi
-; X32-NEXT: retl
+; SSE-X32-LABEL: extract_f128_0:
+; SSE-X32: # BB#0:
+; SSE-X32-NEXT: pushl %edi
+; SSE-X32-NEXT: pushl %esi
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; SSE-X32-NEXT: movl %esi, 12(%edi)
+; SSE-X32-NEXT: movl %edx, 8(%edi)
+; SSE-X32-NEXT: movl %ecx, 4(%edi)
+; SSE-X32-NEXT: movl %eax, (%edi)
+; SSE-X32-NEXT: popl %esi
+; SSE-X32-NEXT: popl %edi
+; SSE-X32-NEXT: retl
 ;
 ; SSE2-X64-LABEL: extract_f128_0:
 ; SSE2-X64: # BB#0:
@@ -539,6 +539,13 @@ define void @extract_f128_0(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
 ; SSE41-X64-NEXT: movq %rsi, (%rdi)
 ; SSE41-X64-NEXT: retq
 ;
+; AVX-X32-LABEL: extract_f128_0:
+; AVX-X32: # BB#0:
+; AVX-X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
+; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT: vmovups %xmm0, (%eax)
+; AVX-X32-NEXT: retl
+;
 ; AVX-X64-LABEL: extract_f128_0:
 ; AVX-X64: # BB#0:
 ; AVX-X64-NEXT: movq %rdx, 8(%rdi)
@@ -555,22 +562,22 @@ define void @extract_f128_0(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
 }
 
 define void @extract_f128_1(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
-; X32-LABEL: extract_f128_1:
-; X32: # BB#0:
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %esi, 12(%edi)
-; X32-NEXT: movl %edx, 8(%edi)
-; X32-NEXT: movl %ecx, 4(%edi)
-; X32-NEXT: movl %eax, (%edi)
-; X32-NEXT: popl %esi
-; X32-NEXT: popl %edi
-; X32-NEXT: retl
+; SSE-X32-LABEL: extract_f128_1:
+; SSE-X32: # BB#0:
+; SSE-X32-NEXT: pushl %edi
+; SSE-X32-NEXT: pushl %esi
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; SSE-X32-NEXT: movl %esi, 12(%edi)
+; SSE-X32-NEXT: movl %edx, 8(%edi)
+; SSE-X32-NEXT: movl %ecx, 4(%edi)
+; SSE-X32-NEXT: movl %eax, (%edi)
+; SSE-X32-NEXT: popl %esi
+; SSE-X32-NEXT: popl %edi
+; SSE-X32-NEXT: retl
 ;
 ; SSE2-X64-LABEL: extract_f128_1:
 ; SSE2-X64: # BB#0:
@@ -584,6 +591,13 @@ define void @extract_f128_1(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
 ; SSE41-X64-NEXT: movq %rcx, (%rdi)
 ; SSE41-X64-NEXT: retq
 ;
+; AVX-X32-LABEL: extract_f128_1:
+; AVX-X32: # BB#0:
+; AVX-X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
+; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT: vmovups %xmm0, (%eax)
+; AVX-X32-NEXT: retl
+;
 ; AVX-X64-LABEL: extract_f128_1:
 ; AVX-X64: # BB#0:
 ; AVX-X64-NEXT: movq %r8, 8(%rdi)
12 changes: 8 additions & 4 deletions test/CodeGen/X86/fold-vector-sext-crash2.ll
@@ -53,8 +53,10 @@ define <2 x i256> @test_zext1() {
   ret <2 x i256> %Shuff
 
 ; X64-LABEL: test_zext1
-; X64: movq $0
-; X64-NEXT: movq $0
+; X64: xorps %xmm0, %xmm0
+; X64: movaps %xmm0
+; X64: movaps %xmm0
+; X64: movaps %xmm0
 ; X64-NEXT: movq $0
 ; X64-NEXT: movq $254
 
@@ -75,8 +77,10 @@ define <2 x i256> @test_zext2() {
   ret <2 x i256> %Shuff
 
 ; X64-LABEL: test_zext2
-; X64: movq $0
-; X64-NEXT: movq $0
+; X64: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0
+; X64-NEXT: movaps %xmm0
+; X64-NEXT: movaps %xmm0
 ; X64-NEXT: movq $-1
 ; X64-NEXT: movq $-2
 
26 changes: 10 additions & 16 deletions test/CodeGen/X86/legalize-shl-vec.ll
@@ -26,14 +26,11 @@ define <2 x i256> @test_shl(<2 x i256> %In) {
 ;
 ; X64-LABEL: test_shl:
 ; X64: # BB#0:
-; X64-NEXT: movq $0, 56(%rdi)
-; X64-NEXT: movq $0, 48(%rdi)
-; X64-NEXT: movq $0, 40(%rdi)
-; X64-NEXT: movq $0, 32(%rdi)
-; X64-NEXT: movq $0, 24(%rdi)
-; X64-NEXT: movq $0, 16(%rdi)
-; X64-NEXT: movq $0, 8(%rdi)
-; X64-NEXT: movq $0, (%rdi)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, 48(%rdi)
+; X64-NEXT: movaps %xmm0, 32(%rdi)
+; X64-NEXT: movaps %xmm0, 16(%rdi)
+; X64-NEXT: movaps %xmm0, (%rdi)
 ; X64-NEXT: movq %rdi, %rax
 ; X64-NEXT: retq
   %Amt = insertelement <2 x i256> undef, i256 -1, i32 0
@@ -65,14 +62,11 @@ define <2 x i256> @test_srl(<2 x i256> %In) {
 ;
 ; X64-LABEL: test_srl:
 ; X64: # BB#0:
-; X64-NEXT: movq $0, 56(%rdi)
-; X64-NEXT: movq $0, 48(%rdi)
-; X64-NEXT: movq $0, 40(%rdi)
-; X64-NEXT: movq $0, 32(%rdi)
-; X64-NEXT: movq $0, 24(%rdi)
-; X64-NEXT: movq $0, 16(%rdi)
-; X64-NEXT: movq $0, 8(%rdi)
-; X64-NEXT: movq $0, (%rdi)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, 48(%rdi)
+; X64-NEXT: movaps %xmm0, 32(%rdi)
+; X64-NEXT: movaps %xmm0, 16(%rdi)
+; X64-NEXT: movaps %xmm0, (%rdi)
 ; X64-NEXT: movq %rdi, %rax
 ; X64-NEXT: retq
   %Amt = insertelement <2 x i256> undef, i256 -1, i32 0