forked from llvm-mirror/llvm
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[AMDGPU] Add sdwa support for ADD|SUB U64 decomposed Pseudos
The introduction of the S_{ADD|SUB}_U64_PSEUDO instructions, which are decomposed into VOP3 instruction pairs (S_ADD_U64_PSEUDO into V_ADD_I32_e64 and V_ADDC_U32_e64; S_SUB_U64_PSEUDO into V_SUB_I32_e64 and V_SUBB_U32_e64), precludes the use of SDWA to encode a constant. SDWA (Sub-Dword Addressing) is supported on VOP1 and VOP2 instructions, but not on VOP3 instructions.

We want to fold the bit-and operand into the instruction encoding of the V_ADD_I32 instruction. This requires that we transform the VOP3 form of the instruction into its VOP2 form (_e32). For example, the sequence

    %19:vgpr_32 = V_AND_B32_e32 255, killed %16:vgpr_32, implicit $exec
    %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64 %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
    %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64 %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec

is first converted to the VOP2 form, which then allows the SDWA encoding, and becomes

    %47:vgpr_32 = V_ADD_I32_sdwa 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0, implicit-def $vcc, implicit $exec
    %48:vgpr_32 = V_ADDC_U32_e32 0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec

Differential Revision: https://reviews.llvm.org/D54882

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@348132 91177308-0d34-0410-b5e6-96231b3b80d8
- Loading branch information
Showing
4 changed files
with
563 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GCN %s | ||
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=FIJI,GCN %s | ||
|
||
; GCN-LABEL: {{^}}test_add_co_sdwa: | ||
; GFX9: v_add_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | ||
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} | ||
; FIJI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | ||
; FIJI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} | ||
; Kernel for the 64-bit add case matched by the GFX9/FIJI check lines above. | ||
; Using the value returned by @llvm.amdgcn.workitem.id.x() as the element index, it | ||
; loads an i32 from %arg1, keeps only its low 8 bits, zero-extends that to i64, and | ||
; adds it in place to the i64 element of %arg at the same index. | ||
define amdgpu_kernel void @test_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 { | ||
bb: | ||
; %tmp is the index used for both the i32 and the i64 element accesses below. | ||
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() | ||
%tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp | ||
%tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4 | ||
; Mask to the low 8 bits; the check lines expect the add to carry src1_sel:BYTE_0. | ||
%tmp5 = and i32 %tmp4, 255 | ||
%tmp6 = zext i32 %tmp5 to i64 | ||
%tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp | ||
%tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8 | ||
; 64-bit add; the check lines expect an SDWA add that writes vcc followed by an | ||
; add-with-carry that reads vcc. | ||
%tmp9 = add nsw i64 %tmp8, %tmp6 | ||
; The result overwrites the same i64 element that was loaded. | ||
store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8 | ||
ret void | ||
} | ||
|
||
|
||
; GCN-LABEL: {{^}}test_sub_co_sdwa: | ||
; GFX9: v_sub_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | ||
; GFX9: v_subbrev_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} | ||
; FIJI: v_sub_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | ||
; FIJI: v_subbrev_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} | ||
; Kernel for the 64-bit subtract case matched by the GFX9/FIJI check lines above. | ||
; Using the value returned by @llvm.amdgcn.workitem.id.x() as the element index, it | ||
; loads an i32 from %arg1, keeps only its low 8 bits, zero-extends that to i64, and | ||
; subtracts it in place from the i64 element of %arg at the same index. | ||
define amdgpu_kernel void @test_sub_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 { | ||
bb: | ||
; %tmp is the index used for both the i32 and the i64 element accesses below. | ||
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() | ||
%tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp | ||
%tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4 | ||
; Mask to the low 8 bits; the check lines expect the subtract to carry src1_sel:BYTE_0. | ||
%tmp5 = and i32 %tmp4, 255 | ||
%tmp6 = zext i32 %tmp5 to i64 | ||
%tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp | ||
%tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8 | ||
; 64-bit subtract (loaded i64 minus the zero-extended byte); the check lines expect | ||
; an SDWA subtract that writes vcc followed by a v_subbrev that reads vcc. | ||
%tmp9 = sub nsw i64 %tmp8, %tmp6 | ||
; The result overwrites the same i64 element that was loaded. | ||
store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8 | ||
ret void | ||
} | ||
|
||
; GCN-LABEL: {{^}}test1_add_co_sdwa: | ||
; GFX9: v_add_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | ||
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} | ||
; GFX9: v_add_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | ||
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} | ||
; FIJI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | ||
; FIJI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} | ||
; FIJI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | ||
; FIJI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} | ||
; Kernel containing two consecutive copies of the masked-byte 64-bit add sequence; the | ||
; GFX9/FIJI check lines above expect two SDWA add / add-with-carry pairs. | ||
; Both sequences read the same i32 element of %arg1 (same base and index); the first | ||
; updates an i64 element of %arg and the second updates an i64 element of %arg2. | ||
define amdgpu_kernel void @test1_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1, i64 addrspace(1)* %arg2) #0 { | ||
bb: | ||
; %tmp is the element index shared by every access in this function. | ||
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() | ||
; First sequence: %arg[%tmp] += zext(%arg1[%tmp] & 255). | ||
%tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp | ||
%tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4 | ||
%tmp5 = and i32 %tmp4, 255 | ||
%tmp6 = zext i32 %tmp5 to i64 | ||
%tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp | ||
%tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8 | ||
%tmp9 = add nsw i64 %tmp8, %tmp6 | ||
store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8 | ||
; Second sequence: %arg2[%tmp] += zext(%arg1[%tmp] & 255). The i32 is loaded again, | ||
; after the store above, from the same address expression as %tmp3. | ||
%tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp | ||
%tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4 | ||
%tmp15 = and i32 %tmp14, 255 | ||
%tmp16 = zext i32 %tmp15 to i64 | ||
%tmp17 = getelementptr inbounds i64, i64 addrspace(1)* %arg2, i32 %tmp | ||
%tmp18 = load i64, i64 addrspace(1)* %tmp17, align 8 | ||
%tmp19 = add nsw i64 %tmp18, %tmp16 | ||
store i64 %tmp19, i64 addrspace(1)* %tmp17, align 8 | ||
ret void | ||
} | ||
|
||
; Declaration of the intrinsic each kernel in this file calls to obtain the i32 it uses as its element index. |
declare i32 @llvm.amdgcn.workitem.id.x() |
Oops, something went wrong.