Revert "In visitSTORE, always use FindBetterChain, rather than only w…
Browse files Browse the repository at this point in the history
…hen UseAA is enabled."

This reverts commit r293893, which is miscompiling lua on ARM and breaking bootstrapping for x86-windows.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@293915 91177308-0d34-0410-b5e6-96231b3b80d8
niravhdave committed Feb 2, 2017
1 parent 416bf90 commit 529986a
Showing 72 changed files with 2,226 additions and 2,046 deletions.
3 changes: 0 additions & 3 deletions include/llvm/Target/TargetLowering.h
@@ -363,9 +363,6 @@ class TargetLoweringBase {
   return false;
 }
 
-/// Returns if it's reasonable to merge stores to MemVT size.
-virtual bool canMergeStoresTo(EVT MemVT) const { return true; }
-
 /// \brief Return true if it is cheap to speculate a call to intrinsic cttz.
 virtual bool isCheapToSpeculateCttz() const {
   return false;

740 changes: 368 additions & 372 deletions lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lib/CodeGen/TargetLoweringBase.cpp
@@ -851,7 +851,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
 MinFunctionAlignment = 0;
 PrefFunctionAlignment = 0;
 PrefLoopAlignment = 0;
-GatherAllAliasesMaxDepth = 18;
+GatherAllAliasesMaxDepth = 6;
 MinStackArgumentAlignment = 1;
 // TODO: the default will be switched to 0 in the next commit, along
 // with the Target-specific changes necessary.

2 changes: 1 addition & 1 deletion lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9254,7 +9254,7 @@ static SDValue performSTORECombine(SDNode *N,
 return SDValue();
 }
 
-/// This function handles the log2-shuffle pattern produced by the
+/// This function handles the log2-shuffle pattern produced by the
 /// LoopVectorizer for the across vector reduction. It consists of
 /// log2(NumVectorElements) steps and, in each step, 2^(s) elements
 /// are reduced, where s is an induction variable from 0 to

10 changes: 10 additions & 0 deletions lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -451,6 +451,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
 PredictableSelectIsExpensive = false;
 
+// We want to find all load dependencies for long chains of stores to enable
+// merging into very wide vectors. The problem is with vectors with > 4
+// elements. MergeConsecutiveStores will attempt to merge these because x8/x16
+// vectors are a legal type, even though we have to split the loads
+// usually. When we can more precisely specify load legality per address
+// space, we should be able to make FindBetterChain/MergeConsecutiveStores
+// smarter so that they can figure out what to do in 2 iterations without all
+// N > 4 stores on the same chain.
+GatherAllAliasesMaxDepth = 16;
+
 // FIXME: Need to really handle these.
 MaxStoresPerMemcpy = 4096;
 MaxStoresPerMemmove = 4096;

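Note on the hunk above: GatherAllAliasesMaxDepth bounds how far the DAG combiner searches backward through chain operands when trying to prove that memory nodes do not alias. The following is a rough illustration only (a hedged sketch, not DAGCombiner's actual code; Node and gatherAliases are invented stand-ins for this note):

#include <vector>

// Simplified stand-in for an SDNode: only the chain predecessors and a
// may-alias flag matter for this sketch.
struct Node {
  std::vector<Node *> ChainPreds;
  bool MayAlias = false;
};

// Depth-limited walk in the spirit of FindBetterChain's alias gathering:
// record possibly-aliasing predecessors, and once the depth budget is spent,
// give up and conservatively treat the unexplored frontier node as an alias.
void gatherAliases(Node *N, unsigned Depth, unsigned MaxDepth,
                   std::vector<Node *> &Aliases) {
  if (Depth > MaxDepth) {
    Aliases.push_back(N); // unexplored, so assume it may alias
    return;
  }
  for (Node *Pred : N->ChainPreds) {
    if (Pred->MayAlias)
      Aliases.push_back(Pred);
    else
      gatherAliases(Pred, Depth + 1, MaxDepth, Aliases);
  }
}

A larger depth budget lets long store chains be fully disambiguated, so more stores can merge, at the cost of extra combiner work. That is why the base default reverts to 6 in TargetLoweringBase.cpp above while AMDGPU opts back into 16 here.
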
5 changes: 0 additions & 5 deletions lib/Target/ARM/ARMISelLowering.h
@@ -500,11 +500,6 @@ class InstrItineraryData;
 bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                unsigned &Cost) const override;
 
-bool canMergeStoresTo(EVT MemVT) const override {
-  // Do not merge to larger than i32.
-  return (MemVT.getSizeInBits() <= 32);
-}
-
 bool isCheapToSpeculateCttz() const override;
 bool isCheapToSpeculateCtlz() const override;
 

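Note on the hunk above: together with the TargetLowering.h hunk earlier in this commit, this deletion removes the whole canMergeStoresTo mechanism. A condensed sketch of the pattern being reverted, reassembled from those two hunks (the include, the sketch namespace, and the flattened inheritance are mine for illustration, not part of the commit):

#include "llvm/CodeGen/ValueTypes.h"

namespace sketch {

// Default removed from include/llvm/Target/TargetLowering.h: merging stores
// to any MemVT is considered reasonable unless a target overrides this.
class TargetLoweringBase {
public:
  virtual ~TargetLoweringBase() = default;

  /// Returns if it's reasonable to merge stores to MemVT size.
  virtual bool canMergeStoresTo(llvm::EVT MemVT) const { return true; }
};

// Override removed from lib/Target/ARM/ARMISelLowering.h (in tree it sits
// several inheritance layers below TargetLoweringBase).
class ARMTargetLowering : public TargetLoweringBase {
public:
  bool canMergeStoresTo(llvm::EVT MemVT) const override {
    // Do not merge to larger than i32.
    return MemVT.getSizeInBits() <= 32;
  }
};

} // namespace sketch

With the hook gone, the store-merging code no longer gives targets a veto over merged store width, matching the pre-r293893 behavior this commit restores.
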
4 changes: 2 additions & 2 deletions test/CodeGen/AArch64/argument-blocks.ll
@@ -59,10 +59,10 @@ define i64 @test_hfa_ignores_gprs([7 x float], [2 x float] %in, i64, i64 %res) {
 }
 
 ; [2 x float] should not be promoted to double by the Darwin varargs handling,
-; but should go in an 8-byte aligned slot and can be merged as integer stores.
+; but should go in an 8-byte aligned slot.
 define void @test_varargs_stackalign() {
 ; CHECK-LABEL: test_varargs_stackalign:
-; CHECK-DARWINPCS: str {{x[0-9]+}}, [sp, #16]
+; CHECK-DARWINPCS: stp {{w[0-9]+}}, {{w[0-9]+}}, [sp, #16]
 
 call void(...) @callee([3 x float] undef, [2 x float] [float 1.0, float 2.0])
 ret void

5 changes: 4 additions & 1 deletion test/CodeGen/AArch64/arm64-abi.ll
@@ -205,7 +205,10 @@ declare i32 @args_i32(i32, i32, i32, i32, i32, i32, i32, i32, i16 signext, i32,
 define i32 @test8(i32 %argc, i8** nocapture %argv) nounwind {
 entry:
 ; CHECK-LABEL: test8
-; CHECK: str w8, [sp]
+; CHECK: strb {{w[0-9]+}}, [sp, #3]
+; CHECK: strb wzr, [sp, #2]
+; CHECK: strb {{w[0-9]+}}, [sp, #1]
+; CHECK: strb wzr, [sp]
 ; CHECK: bl
 ; FAST-LABEL: test8
 ; FAST: strb {{w[0-9]+}}, [sp]

4 changes: 2 additions & 2 deletions test/CodeGen/AArch64/arm64-memset-inline.ll
@@ -13,8 +13,8 @@ define void @t2() nounwind ssp {
 entry:
 ; CHECK-LABEL: t2:
 ; CHECK: strh wzr, [sp, #32]
-; CHECK: stp xzr, xzr, [sp, #8]
-; CHECK: str xzr, [sp, #24]
+; CHECK: stp xzr, xzr, [sp, #16]
+; CHECK: str xzr, [sp, #8]
 %buf = alloca [26 x i8], align 1
 %0 = getelementptr inbounds [26 x i8], [26 x i8]* %buf, i32 0, i32 0
 call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false)

2 changes: 1 addition & 1 deletion test/CodeGen/AArch64/arm64-variadic-aapcs.ll
@@ -99,7 +99,7 @@ define void @test_nospare([8 x i64], [8 x float], ...) {
 ; __stack field should point just past them.
 define void @test_offsetstack([8 x i64], [2 x i64], [3 x float], ...) {
 ; CHECK-LABEL: test_offsetstack:
-; CHECK: stp {{q[0-9]+}}, {{q[0-9]+}}, [sp, #-80]!
+; CHECK: sub sp, sp, #80
 ; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #96
 ; CHECK: add x[[VAR:[0-9]+]], {{x[0-9]+}}, :lo12:var
 ; CHECK: str [[STACK_TOP]], [x[[VAR]]]

3 changes: 2 additions & 1 deletion test/CodeGen/AArch64/merge-store.ll
@@ -4,7 +4,8 @@
 @g0 = external global <3 x float>, align 16
 @g1 = external global <3 x float>, align 4
 
-; CHECK: ldr q[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]], :lo12:g0
+; CHECK: ldr s[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]]{{\]}}, #4
+; CHECK: ld1{{\.?s?}} { v[[R0]]{{\.?s?}} }[1], {{\[}}[[R1]]{{\]}}
 ; CHECK: str d[[R0]]
 
 define void @blam() {

3 changes: 2 additions & 1 deletion test/CodeGen/AArch64/vector_merge_dep_check.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc --combiner-alias-analysis=false < %s | FileCheck %s
+; RUN: llc --combiner-alias-analysis=true < %s | FileCheck %s
 
 ; This test checks that we do not merge stores together which have
 ; dependencies through their non-chain operands (e.g. one store is the

24 changes: 8 additions & 16 deletions test/CodeGen/AMDGPU/debugger-insert-nops.ll
@@ -1,21 +1,13 @@
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECKNOP
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s
 
-; This test expects that we have one instance for each line in some order with "s_nop 0" instances after each.
-
-; Check that each line appears at least once
-; CHECK-DAG: test01.cl:2:3
-; CHECK-DAG: test01.cl:3:3
-; CHECK-DAG: test01.cl:4:3
+; CHECK: test01.cl:2:{{[0-9]+}}
+; CHECK-NEXT: s_nop 0
 
+; CHECK: test01.cl:3:{{[0-9]+}}
+; CHECK-NEXT: s_nop 0
 
-; Check that each of each of the lines consists of the line output, followed by "s_nop 0"
-; CHECKNOP: test01.cl:{{[234]}}:3
-; CHECKNOP-NEXT: s_nop 0
-; CHECKNOP: test01.cl:{{[234]}}:3
-; CHECKNOP-NEXT: s_nop 0
-; CHECKNOP: test01.cl:{{[234]}}:3
-; CHECKNOP-NEXT: s_nop 0
+; CHECK: test01.cl:4:{{[0-9]+}}
+; CHECK-NEXT: s_nop 0
 
 ; CHECK: test01.cl:5:{{[0-9]+}}
 ; CHECK-NEXT: s_nop 0
@@ -29,7 +21,7 @@ entry:
 call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !17, metadata !18), !dbg !19
 %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !20
 %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0, !dbg !20
-store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !20
+store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !21
 %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !22
 %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 1, !dbg !22
 store i32 2, i32 addrspace(1)* %arrayidx1, align 4, !dbg !23

5 changes: 3 additions & 2 deletions test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -257,8 +257,9 @@ define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a
 
 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2
 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:1
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
+
+; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
 
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_load_ubyte

24 changes: 20 additions & 4 deletions test/CodeGen/AMDGPU/merge-stores.ll
@@ -1,5 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
+
+; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
 
 ; This test is mostly to test DAG store merging, so disable the vectorizer.
 ; Run with devices with different unaligned load restrictions.
@@ -147,7 +150,12 @@ define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
-; GCN-AA: buffer_store_dwordx4 v
+; GCN-NOAA: buffer_store_dwordx4 v
+
+; GCN-AA: buffer_store_dwordx2
+; GCN-AA: buffer_store_dword v
+; GCN-AA: buffer_store_dword v
+
 ; GCN: s_endpgm
 define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
@@ -466,9 +474,17 @@ define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1
 ret void
 }
 
-; This works once AA is enabled on the subtarget
 ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
-; GCN: buffer_store_dwordx4 [[LOAD]]
+
+; GCN-NOAA: buffer_store_dword v
+; GCN-NOAA: buffer_store_dword v
+; GCN-NOAA: buffer_store_dword v
+; GCN-NOAA: buffer_store_dword v
+
+; GCN-AA: buffer_store_dwordx4 [[LOAD]]
+
 ; GCN: s_endpgm
 define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1

12 changes: 6 additions & 6 deletions test/CodeGen/AMDGPU/private-element-size.ll
@@ -32,10 +32,10 @@
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
 
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
 define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -130,8 +130,8 @@ entry:
 ; HSA-ELT8: private_element_size = 2
 ; HSA-ELT4: private_element_size = 1
 
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], s9{{$}}
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], s9 offset:8
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8
 
 ; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
 

17 changes: 9 additions & 8 deletions test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -157,8 +157,9 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out,
 
 ; FUNC-LABEL: @reorder_local_offsets
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102
-; CI-DAG: ds_write2_b32 {{v[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:3 offset1:100
-; CI-DAG: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
+; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:100
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
 ; CI: buffer_store_dword
 ; CI: s_endpgm
 define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
@@ -180,12 +181,12 @@ define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspa
 }
 
 ; FUNC-LABEL: @reorder_global_offsets
-; CI-DAG: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI-DAG: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
-; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
-; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
-; CI: buffer_store_dword
+; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
+; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
+; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
+; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
 ; CI: s_endpgm
 define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
 %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3

3 changes: 1 addition & 2 deletions test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
@@ -12,8 +12,7 @@ define void @test_byval_8_bytes_alignment(i32 %i, ...) {
 entry:
 ; CHECK: sub sp, sp, #12
 ; CHECK: sub sp, sp, #4
-; CHECK: add r0, sp, #4
-; CHECK: stm sp, {r0, r1, r2, r3}
+; CHECK: stmib sp, {r1, r2, r3}
 %g = alloca i8*
 %g1 = bitcast i8** %g to i8*
 call void @llvm.va_start(i8* %g1)

100 changes: 49 additions & 51 deletions test/CodeGen/ARM/alloc-no-stack-realign.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s
+; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s -check-prefix=NO-REALIGN
+; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s -check-prefix=REALIGN
 
 ; rdar://12713765
 ; When realign-stack is set to false, make sure we are not creating stack
@@ -7,31 +8,29 @@
 
 define void @test1(<16 x float>* noalias sret %agg.result) nounwind ssp "no-realign-stack" {
 entry:
-; CHECK-LABEL: test1
-; CHECK: ldr r[[R1:[0-9]+]], [pc, r1]
-; CHECK: add r[[R2:[0-9]+]], r1, #48
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: mov r[[R2:[0-9]+]], r[[R1]]
-; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R1:[0-9]+]], r[[R1]], #32
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: mov r[[R1:[0-9]+]], sp
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: add r[[R2:[0-9]+]], r[[R1]], #32
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #48
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #32
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r0:128]!
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r0:128]
+; NO-REALIGN-LABEL: test1
+; NO-REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]]
+; NO-REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
+; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
+; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #48
+; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+
+; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1:[0-9]+]], #48
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: mov r[[R3:[0-9]+]], r[[R1]]
+; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R3]]:128]!
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R3]]:128]
+
+; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0:0]], #48
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0]], #32
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]!
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]
 %retval = alloca <16 x float>, align 16
 %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
 store <16 x float> %0, <16 x float>* %retval
@@ -42,33 +41,32 @@ entry:
 
 define void @test2(<16 x float>* noalias sret %agg.result) nounwind ssp {
 entry:
-; CHECK: ldr r[[R1:[0-9]+]], [pc, r1]
-; CHECK: add r[[R2:[0-9]+]], r[[R1]], #48
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: mov r[[R2:[0-9]+]], r[[R1]]
-; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R1:[0-9]+]], r[[R1]], #32
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: mov r[[R1:[0-9]+]], sp
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: orr r[[R2:[0-9]+]], r[[R1]], #32
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #48
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #32
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r0:128]!
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r0:128]
+; REALIGN-LABEL: test2
+; REALIGN: bfc sp, #0, #6
+; REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]]
+; REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
+; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
+; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #48
+; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
 
+
-%retval = alloca <16 x float>, align 16
+; REALIGN: orr r[[R2:[0-9]+]], r[[R1:[0-9]+]], #48
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; REALIGN: orr r[[R2:[0-9]+]], r[[R1]], #32
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; REALIGN: orr r[[R2:[0-9]+]], r[[R1]], #16
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+
+; REALIGN: add r[[R1:[0-9]+]], r[[R0:0]], #48
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; REALIGN: add r[[R1:[0-9]+]], r[[R0]], #32
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]!
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]
 %retval = alloca <16 x float>, align 16
 %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
 store <16 x float> %0, <16 x float>* %retval
 %1 = load <16 x float>, <16 x float>* %retval