diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 897ee89323f08..142dd64ddea9d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1752,8 +1752,9 @@ def ROTL64reg_sw :
             ".reg .b64 %lhs;\n\t"
             ".reg .b64 %rhs;\n\t"
             ".reg .u32 %amt2;\n\t"
-            "shl.b64 \t%lhs, $src, $amt;\n\t"
-            "sub.u32 \t%amt2, 64, $amt;\n\t"
+            "and.b32 \t%amt2, $amt, 63;\n\t"
+            "shl.b64 \t%lhs, $src, %amt2;\n\t"
+            "sub.u32 \t%amt2, 64, %amt2;\n\t"
             "shr.b64 \t%rhs, $src, %amt2;\n\t"
             "add.u64 \t$dst, %lhs, %rhs;\n\t"
             "}}",
@@ -1765,8 +1766,9 @@ def ROTR64reg_sw :
             ".reg .b64 %lhs;\n\t"
             ".reg .b64 %rhs;\n\t"
             ".reg .u32 %amt2;\n\t"
-            "shr.b64 \t%lhs, $src, $amt;\n\t"
-            "sub.u32 \t%amt2, 64, $amt;\n\t"
+            "and.b32 \t%amt2, $amt, 63;\n\t"
+            "shr.b64 \t%lhs, $src, %amt2;\n\t"
+            "sub.u32 \t%amt2, 64, %amt2;\n\t"
             "shl.b64 \t%rhs, $src, %amt2;\n\t"
             "add.u64 \t$dst, %lhs, %rhs;\n\t"
             "}}",
diff --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll
index 9d058662c2717..20c7ae5908d29 100644
--- a/llvm/test/CodeGen/NVPTX/rotate.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate.ll
@@ -1,7 +1,8 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s --mtriple=nvptx64 -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
+; RUN: llc < %s --mtriple=nvptx64 -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
+; RUN: %if ptxas %{ llc < %s --mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s --mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
 
 declare i32 @llvm.nvvm.rotate.b32(i32, i32)
 
@@ -11,11 +12,35 @@ declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
 ; SM20: rotate32
 ; SM35: rotate32
 define i32 @rotate32(i32 %a, i32 %b) {
-; SM20: shl.b32
-; SM20: sub.s32
-; SM20: shr.b32
-; SM20: add.u32
-; SM35: shf.l.wrap.b32
+; SM20-LABEL: rotate32(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<4>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u32 %r1, [rotate32_param_0];
+; SM20-NEXT: ld.param.u32 %r2, [rotate32_param_1];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b32 %lhs;
+; SM20-NEXT: .reg .b32 %rhs;
+; SM20-NEXT: .reg .b32 %amt2;
+; SM20-NEXT: shl.b32 %lhs, %r1, %r2;
+; SM20-NEXT: sub.s32 %amt2, 32, %r2;
+; SM20-NEXT: shr.b32 %rhs, %r1, %amt2;
+; SM20-NEXT: add.u32 %r3, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b32 [func_retval0+0], %r3;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotate32(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<4>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u32 %r1, [rotate32_param_0];
+; SM35-NEXT: ld.param.u32 %r2, [rotate32_param_1];
+; SM35-NEXT: shf.l.wrap.b32 %r3, %r1, %r1, %r2;
+; SM35-NEXT: st.param.b32 [func_retval0+0], %r3;
+; SM35-NEXT: ret;
   %val = tail call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 %b)
   ret i32 %val
 }
@@ -23,12 +48,48 @@ define i32 @rotate32(i32 %a, i32 %b) {
 ; SM20: rotate64
 ; SM35: rotate64
 define i64 @rotate64(i64 %a, i32 %b) {
-; SM20: shl.b64
-; SM20: sub.u32
-; SM20: shr.b64
-; SM20: add.u64
-; SM35: shf.l.wrap.b32
-; SM35: shf.l.wrap.b32
+; SM20-LABEL: rotate64(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<2>;
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
+; SM20-NEXT: ld.param.u32 %r1, [rotate64_param_1];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: .reg .u32 %amt2;
+; SM20-NEXT: and.b32 %amt2, %r1, 63;
+; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotate64(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<6>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b32 %dummy;
+; SM35-NEXT: mov.b64 {%dummy,%r1}, %rd1;
+; SM35-NEXT: }
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b32 %dummy;
+; SM35-NEXT: mov.b64 {%r2,%dummy}, %rd1;
+; SM35-NEXT: }
+; SM35-NEXT: ld.param.u32 %r3, [rotate64_param_1];
+; SM35-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, %r3;
+; SM35-NEXT: shf.l.wrap.b32 %r5, %r1, %r2, %r3;
+; SM35-NEXT: mov.b64 %rd2, {%r5, %r4};
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ret;
   %val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 %b)
   ret i64 %val
 }
@@ -36,12 +97,48 @@ define i64 @rotate64(i64 %a, i32 %b) {
 ; SM20: rotateright64
 ; SM35: rotateright64
 define i64 @rotateright64(i64 %a, i32 %b) {
-; SM20: shr.b64
-; SM20: sub.u32
-; SM20: shl.b64
-; SM20: add.u64
-; SM35: shf.r.wrap.b32
-; SM35: shf.r.wrap.b32
+; SM20-LABEL: rotateright64(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<2>;
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotateright64_param_0];
+; SM20-NEXT: ld.param.u32 %r1, [rotateright64_param_1];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: .reg .u32 %amt2;
+; SM20-NEXT: and.b32 %amt2, %r1, 63;
+; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotateright64(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<6>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotateright64_param_0];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b32 %dummy;
+; SM35-NEXT: mov.b64 {%r1,%dummy}, %rd1;
+; SM35-NEXT: }
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b32 %dummy;
+; SM35-NEXT: mov.b64 {%dummy,%r2}, %rd1;
+; SM35-NEXT: }
+; SM35-NEXT: ld.param.u32 %r3, [rotateright64_param_1];
+; SM35-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, %r3;
+; SM35-NEXT: shf.r.wrap.b32 %r5, %r1, %r2, %r3;
+; SM35-NEXT: mov.b64 %rd2, {%r5, %r4};
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ret;
   %val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 %b)
   ret i64 %val
 }
@@ -49,12 +146,204 @@ define i64 @rotateright64(i64 %a, i32 %b) {
 ; SM20: rotl0
 ; SM35: rotl0
 define i32 @rotl0(i32 %x) {
-; SM20: shl.b32
-; SM20: shr.b32
-; SM20: add.u32
-; SM35: shf.l.wrap.b32
+; SM20-LABEL: rotl0(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u32 %r1, [rotl0_param_0];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b32 %lhs;
+; SM20-NEXT: .reg .b32 %rhs;
+; SM20-NEXT: shl.b32 %lhs, %r1, 8;
+; SM20-NEXT: shr.b32 %rhs, %r1, 24;
+; SM20-NEXT: add.u32 %r2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b32 [func_retval0+0], %r2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotl0(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u32 %r1, [rotl0_param_0];
+; SM35-NEXT: shf.l.wrap.b32 %r2, %r1, %r1, 8;
+; SM35-NEXT: st.param.b32 [func_retval0+0], %r2;
+; SM35-NEXT: ret;
   %t0 = shl i32 %x, 8
   %t1 = lshr i32 %x, 24
   %t2 = or i32 %t0, %t1
   ret i32 %t2
 }
+
+declare i64 @llvm.fshl.i64(i64, i64, i64)
+declare i64 @llvm.fshr.i64(i64, i64, i64)
+
+; SM35: rotl64
+define i64 @rotl64(i64 %a, i64 %n) {
+; SM20-LABEL: rotl64(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<2>;
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotl64_param_0];
+; SM20-NEXT: ld.param.u32 %r1, [rotl64_param_1];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: .reg .u32 %amt2;
+; SM20-NEXT: and.b32 %amt2, %r1, 63;
+; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotl64(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<2>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotl64_param_0];
+; SM35-NEXT: ld.param.u32 %r1, [rotl64_param_1];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b64 %lhs;
+; SM35-NEXT: .reg .b64 %rhs;
+; SM35-NEXT: .reg .u32 %amt2;
+; SM35-NEXT: and.b32 %amt2, %r1, 63;
+; SM35-NEXT: shl.b64 %lhs, %rd1, %amt2;
+; SM35-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM35-NEXT: shr.b64 %rhs, %rd1, %amt2;
+; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT: }
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ret;
+  %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %n)
+  ret i64 %val
+}
+
+; SM35: rotl64_imm
+define i64 @rotl64_imm(i64 %a) {
+; SM20-LABEL: rotl64_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: shl.b64 %lhs, %rd1, 2;
+; SM20-NEXT: shr.b64 %rhs, %rd1, 62;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotl64_imm(
+; SM35: {
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b64 %lhs;
+; SM35-NEXT: .reg .b64 %rhs;
+; SM35-NEXT: shl.b64 %lhs, %rd1, 2;
+; SM35-NEXT: shr.b64 %rhs, %rd1, 62;
+; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT: }
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ret;
+  %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66)
+  ret i64 %val
+}
+
+; SM35: rotr64
+define i64 @rotr64(i64 %a, i64 %n) {
+; SM20-LABEL: rotr64(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<2>;
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotr64_param_0];
+; SM20-NEXT: ld.param.u32 %r1, [rotr64_param_1];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: .reg .u32 %amt2;
+; SM20-NEXT: and.b32 %amt2, %r1, 63;
+; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotr64(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<2>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotr64_param_0];
+; SM35-NEXT: ld.param.u32 %r1, [rotr64_param_1];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b64 %lhs;
+; SM35-NEXT: .reg .b64 %rhs;
+; SM35-NEXT: .reg .u32 %amt2;
+; SM35-NEXT: and.b32 %amt2, %r1, 63;
+; SM35-NEXT: shr.b64 %lhs, %rd1, %amt2;
+; SM35-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM35-NEXT: shl.b64 %rhs, %rd1, %amt2;
+; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT: }
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ret;
+  %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %n)
+  ret i64 %val
+}
+
+; SM35: rotr64_imm
+define i64 @rotr64_imm(i64 %a) {
+; SM20-LABEL: rotr64_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: shl.b64 %lhs, %rd1, 62;
+; SM20-NEXT: shr.b64 %rhs, %rd1, 2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotr64_imm(
+; SM35: {
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b64 %lhs;
+; SM35-NEXT: .reg .b64 %rhs;
+; SM35-NEXT: shl.b64 %lhs, %rd1, 62;
+; SM35-NEXT: shr.b64 %rhs, %rd1, 2;
+; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT: }
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ret;
+  %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66)
+  ret i64 %val
+}