Revert "In visitSTORE, always use FindBetterChain, rather than only w…
Browse files Browse the repository at this point in the history
…hen UseAA is enabled."

This reverts commit r293893, which is miscompiling lua on ARM and breaking bootstrapping for x86-windows.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@293915 91177308-0d34-0410-b5e6-96231b3b80d8
niravhdave committed Feb 2, 2017
1 parent 416bf90 commit 529986a
Showing 72 changed files with 2,226 additions and 2,046 deletions.
3 changes: 0 additions & 3 deletions include/llvm/Target/TargetLowering.h
@@ -363,9 +363,6 @@ class TargetLoweringBase {
   return false;
 }
 
-/// Returns if it's reasonable to merge stores to MemVT size.
-virtual bool canMergeStoresTo(EVT MemVT) const { return true; }
-
 /// \brief Return true if it is cheap to speculate a call to intrinsic cttz.
 virtual bool isCheapToSpeculateCttz() const {
   return false;

740 changes: 368 additions & 372 deletions lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lib/CodeGen/TargetLoweringBase.cpp
@@ -851,7 +851,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
 MinFunctionAlignment = 0;
 PrefFunctionAlignment = 0;
 PrefLoopAlignment = 0;
-GatherAllAliasesMaxDepth = 18;
+GatherAllAliasesMaxDepth = 6;
 MinStackArgumentAlignment = 1;
 // TODO: the default will be switched to 0 in the next commit, along
 // with the Target-specific changes necessary.

2 changes: 1 addition & 1 deletion lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9254,7 +9254,7 @@ static SDValue performSTORECombine(SDNode *N,
 return SDValue();
 }
 
-/// This function handles the log2-shuffle pattern produced by the
+/// This function handles the log2-shuffle pattern produced by the
 /// LoopVectorizer for the across vector reduction. It consists of
 /// log2(NumVectorElements) steps and, in each step, 2^(s) elements
 /// are reduced, where s is an induction variable from 0 to

10 changes: 10 additions & 0 deletions lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -451,6 +451,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
 PredictableSelectIsExpensive = false;
 
+// We want to find all load dependencies for long chains of stores to enable
+// merging into very wide vectors. The problem is with vectors with > 4
+// elements. MergeConsecutiveStores will attempt to merge these because x8/x16
+// vectors are a legal type, even though we have to split the loads
+// usually. When we can more precisely specify load legality per address
+// space, we should be able to make FindBetterChain/MergeConsecutiveStores
+// smarter so that they can figure out what to do in 2 iterations without all
+// N > 4 stores on the same chain.
+GatherAllAliasesMaxDepth = 16;
+
 // FIXME: Need to really handle these.
 MaxStoresPerMemcpy = 4096;
 MaxStoresPerMemmove = 4096;

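Note on the hunk above: GatherAllAliasesMaxDepth bounds how far the DAG combiner searches backward through chain operands when trying to prove that memory nodes do not alias. The following is a rough illustration only (a hedged sketch, not DAGCombiner's actual code; Node and gatherAliases are invented stand-ins for this note):

#include <vector>

// Simplified stand-in for an SDNode: only the chain predecessors and a
// may-alias flag matter for this sketch.
struct Node {
  std::vector<Node *> ChainPreds;
  bool MayAlias = false;
};

// Depth-limited walk in the spirit of FindBetterChain's alias gathering:
// record possibly-aliasing predecessors, and once the depth budget is spent,
// give up and conservatively treat the unexplored frontier node as an alias.
void gatherAliases(Node *N, unsigned Depth, unsigned MaxDepth,
                   std::vector<Node *> &Aliases) {
  if (Depth > MaxDepth) {
    Aliases.push_back(N); // unexplored, so assume it may alias
    return;
  }
  for (Node *Pred : N->ChainPreds) {
    if (Pred->MayAlias)
      Aliases.push_back(Pred);
    else
      gatherAliases(Pred, Depth + 1, MaxDepth, Aliases);
  }
}

A larger depth budget lets long store chains be fully disambiguated, so more stores can merge, at the cost of extra combiner work. That is why the base default reverts to 6 in TargetLoweringBase.cpp above while AMDGPU opts back into 16 here.
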
5 changes: 0 additions & 5 deletions lib/Target/ARM/ARMISelLowering.h
@@ -500,11 +500,6 @@ class InstrItineraryData;
 bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                unsigned &Cost) const override;
 
-bool canMergeStoresTo(EVT MemVT) const override {
-  // Do not merge to larger than i32.
-  return (MemVT.getSizeInBits() <= 32);
-}
-
 bool isCheapToSpeculateCttz() const override;
 bool isCheapToSpeculateCtlz() const override;
 

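Note on the hunk above: together with the TargetLowering.h hunk earlier in this commit, this deletion removes the whole canMergeStoresTo mechanism. A condensed sketch of the pattern being reverted, reassembled from those two hunks (the include, the sketch namespace, and the flattened inheritance are mine for illustration, not part of the commit):

#include "llvm/CodeGen/ValueTypes.h"

namespace sketch {

// Default removed from include/llvm/Target/TargetLowering.h: merging stores
// to any MemVT is considered reasonable unless a target overrides this.
class TargetLoweringBase {
public:
  virtual ~TargetLoweringBase() = default;

  /// Returns if it's reasonable to merge stores to MemVT size.
  virtual bool canMergeStoresTo(llvm::EVT MemVT) const { return true; }
};

// Override removed from lib/Target/ARM/ARMISelLowering.h (in tree it sits
// several inheritance layers below TargetLoweringBase).
class ARMTargetLowering : public TargetLoweringBase {
public:
  bool canMergeStoresTo(llvm::EVT MemVT) const override {
    // Do not merge to larger than i32.
    return MemVT.getSizeInBits() <= 32;
  }
};

} // namespace sketch

With the hook gone, the store-merging code no longer gives targets a veto over merged store width, matching the pre-r293893 behavior this commit restores.
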
4 changes: 2 additions & 2 deletions test/CodeGen/AArch64/argument-blocks.ll
@@ -59,10 +59,10 @@ define i64 @test_hfa_ignores_gprs([7 x float], [2 x float] %in, i64, i64 %res) {
 }
 
 ; [2 x float] should not be promoted to double by the Darwin varargs handling,
-; but should go in an 8-byte aligned slot and can be merged as integer stores.
+; but should go in an 8-byte aligned slot.
 define void @test_varargs_stackalign() {
 ; CHECK-LABEL: test_varargs_stackalign:
-; CHECK-DARWINPCS: str {{x[0-9]+}}, [sp, #16]
+; CHECK-DARWINPCS: stp {{w[0-9]+}}, {{w[0-9]+}}, [sp, #16]
 
 call void(...) @callee([3 x float] undef, [2 x float] [float 1.0, float 2.0])
 ret void

5 changes: 4 additions & 1 deletion test/CodeGen/AArch64/arm64-abi.ll
@@ -205,7 +205,10 @@ declare i32 @args_i32(i32, i32, i32, i32, i32, i32, i32, i32, i16 signext, i32,
 define i32 @test8(i32 %argc, i8** nocapture %argv) nounwind {
 entry:
 ; CHECK-LABEL: test8
-; CHECK: str w8, [sp]
+; CHECK: strb {{w[0-9]+}}, [sp, #3]
+; CHECK: strb wzr, [sp, #2]
+; CHECK: strb {{w[0-9]+}}, [sp, #1]
+; CHECK: strb wzr, [sp]
 ; CHECK: bl
 ; FAST-LABEL: test8
 ; FAST: strb {{w[0-9]+}}, [sp]

4 changes: 2 additions & 2 deletions test/CodeGen/AArch64/arm64-memset-inline.ll
@@ -13,8 +13,8 @@ define void @t2() nounwind ssp {
 entry:
 ; CHECK-LABEL: t2:
 ; CHECK: strh wzr, [sp, #32]
-; CHECK: stp xzr, xzr, [sp, #8]
-; CHECK: str xzr, [sp, #24]
+; CHECK: stp xzr, xzr, [sp, #16]
+; CHECK: str xzr, [sp, #8]
 %buf = alloca [26 x i8], align 1
 %0 = getelementptr inbounds [26 x i8], [26 x i8]* %buf, i32 0, i32 0
 call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false)

2 changes: 1 addition & 1 deletion test/CodeGen/AArch64/arm64-variadic-aapcs.ll
@@ -99,7 +99,7 @@ define void @test_nospare([8 x i64], [8 x float], ...) {
 ; __stack field should point just past them.
 define void @test_offsetstack([8 x i64], [2 x i64], [3 x float], ...) {
 ; CHECK-LABEL: test_offsetstack:
-; CHECK: stp {{q[0-9]+}}, {{q[0-9]+}}, [sp, #-80]!
+; CHECK: sub sp, sp, #80
 ; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #96
 ; CHECK: add x[[VAR:[0-9]+]], {{x[0-9]+}}, :lo12:var
 ; CHECK: str [[STACK_TOP]], [x[[VAR]]]

3 changes: 2 additions & 1 deletion test/CodeGen/AArch64/merge-store.ll
@@ -4,7 +4,8 @@
 @g0 = external global <3 x float>, align 16
 @g1 = external global <3 x float>, align 4
 
-; CHECK: ldr q[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]], :lo12:g0
+; CHECK: ldr s[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]]{{\]}}, #4
+; CHECK: ld1{{\.?s?}} { v[[R0]]{{\.?s?}} }[1], {{\[}}[[R1]]{{\]}}
 ; CHECK: str d[[R0]]
 
 define void @blam() {

3 changes: 2 additions & 1 deletion test/CodeGen/AArch64/vector_merge_dep_check.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc --combiner-alias-analysis=false < %s | FileCheck %s
+; RUN: llc --combiner-alias-analysis=true < %s | FileCheck %s
 
 ; This test checks that we do not merge stores together which have
 ; dependencies through their non-chain operands (e.g. one store is the

24 changes: 8 additions & 16 deletions test/CodeGen/AMDGPU/debugger-insert-nops.ll
@@ -1,21 +1,13 @@
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECKNOP
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s
 
-; This test expects that we have one instance for each line in some order with "s_nop 0" instances after each.
-
-; Check that each line appears at least once
-; CHECK-DAG: test01.cl:2:3
-; CHECK-DAG: test01.cl:3:3
-; CHECK-DAG: test01.cl:4:3
+; CHECK: test01.cl:2:{{[0-9]+}}
+; CHECK-NEXT: s_nop 0
 
+; CHECK: test01.cl:3:{{[0-9]+}}
+; CHECK-NEXT: s_nop 0
 
-; Check that each of each of the lines consists of the line output, followed by "s_nop 0"
-; CHECKNOP: test01.cl:{{[234]}}:3
-; CHECKNOP-NEXT: s_nop 0
-; CHECKNOP: test01.cl:{{[234]}}:3
-; CHECKNOP-NEXT: s_nop 0
-; CHECKNOP: test01.cl:{{[234]}}:3
-; CHECKNOP-NEXT: s_nop 0
+; CHECK: test01.cl:4:{{[0-9]+}}
+; CHECK-NEXT: s_nop 0
 
 ; CHECK: test01.cl:5:{{[0-9]+}}
 ; CHECK-NEXT: s_nop 0
@@ -29,7 +21,7 @@ entry:
 call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !17, metadata !18), !dbg !19
 %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !20
 %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0, !dbg !20
-store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !20
+store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !21
 %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !22
 %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 1, !dbg !22
 store i32 2, i32 addrspace(1)* %arrayidx1, align 4, !dbg !23

5 changes: 3 additions & 2 deletions test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -257,8 +257,9 @@ define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a
 
 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2
 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:1
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
+
+; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
 
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_load_ubyte

24 changes: 20 additions & 4 deletions test/CodeGen/AMDGPU/merge-stores.ll
@@ -1,5 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
+
+; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
 
 ; This test is mostly to test DAG store merging, so disable the vectorizer.
 ; Run with devices with different unaligned load restrictions.
@@ -147,7 +150,12 @@ define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
-; GCN-AA: buffer_store_dwordx4 v
+; GCN-NOAA: buffer_store_dwordx4 v
+
+; GCN-AA: buffer_store_dwordx2
+; GCN-AA: buffer_store_dword v
+; GCN-AA: buffer_store_dword v
+
 ; GCN: s_endpgm
 define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
@@ -466,9 +474,17 @@ define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1
 ret void
 }
 
-; This works once AA is enabled on the subtarget
 ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
-; GCN: buffer_store_dwordx4 [[LOAD]]
+
+; GCN-NOAA: buffer_store_dword v
+; GCN-NOAA: buffer_store_dword v
+; GCN-NOAA: buffer_store_dword v
+; GCN-NOAA: buffer_store_dword v
+
+; GCN-AA: buffer_store_dwordx4 [[LOAD]]
+
 ; GCN: s_endpgm
 define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1

12 changes: 6 additions & 6 deletions test/CodeGen/AMDGPU/private-element-size.ll
@@ -32,10 +32,10 @@
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
 
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
 define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -130,8 +130,8 @@ entry:
 ; HSA-ELT8: private_element_size = 2
 ; HSA-ELT4: private_element_size = 1
 
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], s9{{$}}
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], s9 offset:8
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8
 
 ; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
 

17 changes: 9 additions & 8 deletions test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -157,8 +157,9 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out,
 
 ; FUNC-LABEL: @reorder_local_offsets
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102
-; CI-DAG: ds_write2_b32 {{v[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:3 offset1:100
-; CI-DAG: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
+; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:100
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
 ; CI: buffer_store_dword
 ; CI: s_endpgm
 define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
@@ -180,12 +181,12 @@ define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspa
 }
 
 ; FUNC-LABEL: @reorder_global_offsets
-; CI-DAG: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI-DAG: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
-; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
-; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
-; CI: buffer_store_dword
+; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
+; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
+; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
+; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
 ; CI: s_endpgm
 define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
 %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3

3 changes: 1 addition & 2 deletions test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
@@ -12,8 +12,7 @@ define void @test_byval_8_bytes_alignment(i32 %i, ...) {
 entry:
 ; CHECK: sub sp, sp, #12
 ; CHECK: sub sp, sp, #4
-; CHECK: add r0, sp, #4
-; CHECK: stm sp, {r0, r1, r2, r3}
+; CHECK: stmib sp, {r1, r2, r3}
 %g = alloca i8*
 %g1 = bitcast i8** %g to i8*
 call void @llvm.va_start(i8* %g1)

100 changes: 49 additions & 51 deletions test/CodeGen/ARM/alloc-no-stack-realign.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s
+; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s -check-prefix=NO-REALIGN
+; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s -check-prefix=REALIGN
 
 ; rdar://12713765
 ; When realign-stack is set to false, make sure we are not creating stack
@@ -7,31 +8,29 @@
 
 define void @test1(<16 x float>* noalias sret %agg.result) nounwind ssp "no-realign-stack" {
 entry:
-; CHECK-LABEL: test1
-; CHECK: ldr r[[R1:[0-9]+]], [pc, r1]
-; CHECK: add r[[R2:[0-9]+]], r1, #48
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: mov r[[R2:[0-9]+]], r[[R1]]
-; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R1:[0-9]+]], r[[R1]], #32
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: mov r[[R1:[0-9]+]], sp
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: add r[[R2:[0-9]+]], r[[R1]], #32
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #48
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #32
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r0:128]!
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r0:128]
+; NO-REALIGN-LABEL: test1
+; NO-REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]]
+; NO-REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
+; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
+; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #48
+; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+
+; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1:[0-9]+]], #48
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: mov r[[R3:[0-9]+]], r[[R1]]
+; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R3]]:128]!
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R3]]:128]
+
+; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0:0]], #48
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0]], #32
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]!
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]
 %retval = alloca <16 x float>, align 16
 %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
 store <16 x float> %0, <16 x float>* %retval
@@ -42,33 +41,32 @@ entry:
 
 define void @test2(<16 x float>* noalias sret %agg.result) nounwind ssp {
 entry:
-; CHECK: ldr r[[R1:[0-9]+]], [pc, r1]
-; CHECK: add r[[R2:[0-9]+]], r[[R1]], #48
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: mov r[[R2:[0-9]+]], r[[R1]]
-; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R1:[0-9]+]], r[[R1]], #32
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: mov r[[R1:[0-9]+]], sp
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: orr r[[R2:[0-9]+]], r[[R1]], #32
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #48
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #32
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r0:128]!
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r0:128]
+; REALIGN-LABEL: test2
+; REALIGN: bfc sp, #0, #6
+; REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]]
+; REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
+; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
+; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #48
+; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
 
+
-%retval = alloca <16 x float>, align 16
+; REALIGN: orr r[[R2:[0-9]+]], r[[R1:[0-9]+]], #48
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; REALIGN: orr r[[R2:[0-9]+]], r[[R1]], #32
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; REALIGN: orr r[[R2:[0-9]+]], r[[R1]], #16
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+
+; REALIGN: add r[[R1:[0-9]+]], r[[R0:0]], #48
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; REALIGN: add r[[R1:[0-9]+]], r[[R0]], #32
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]!
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]
 %retval = alloca <16 x float>, align 16
 %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
 store <16 x float> %0, <16 x float>* %retval
 %1 = load <16 x float>, <16 x float>* %retval