[NVPTX] Fix 64 bits rotations with large shift values (#89399)
ROTL and ROTR can take a shift amount larger than the element size, in
which case the effective shift amount should be the shift amount modulo
the element size.

This patch adds the modulo step when the shift amount isn't known at
compile time. Without it, the existing implementation would end up
shifting beyond the type size and give incorrect results.
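
For illustration (not part of the patch), the fixed lowering of a variable-amount 64-bit rotate-left behaves like the C++ sketch below; the function name and structure are mine, not the compiler's:

```cpp
#include <cstdint>

// Sketch of the corrected rotate-left lowering (illustrative only).
// The shift amount is reduced modulo 64 first, matching the new
// `and.b32 %amt2, $amt, 63` instruction added by this patch.
uint64_t rotl64(uint64_t src, uint32_t amt) {
  uint32_t amt2 = amt & 63;           // effective shift amount
  if (amt2 == 0)                      // C++ shifts by >= 64 are undefined,
    return src;                       // so handle the wrap-around case here
  uint64_t lhs = src << amt2;         // bits moved toward the high end
  uint64_t rhs = src >> (64 - amt2);  // bits wrapped around to the low end
  return lhs | rhs;                   // disjoint bits: `or` matches `add.u64`
}
```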
npmiller authored May 1, 2024
1 parent cf2f32c commit 7396ab1
Showing 2 changed files with 320 additions and 29 deletions.
10 changes: 6 additions & 4 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1752,8 +1752,9 @@ def ROTL64reg_sw :
".reg .b64 %lhs;\n\t"
".reg .b64 %rhs;\n\t"
".reg .u32 %amt2;\n\t"
-"shl.b64 \t%lhs, $src, $amt;\n\t"
-"sub.u32 \t%amt2, 64, $amt;\n\t"
+"and.b32 \t%amt2, $amt, 63;\n\t"
+"shl.b64 \t%lhs, $src, %amt2;\n\t"
+"sub.u32 \t%amt2, 64, %amt2;\n\t"
"shr.b64 \t%rhs, $src, %amt2;\n\t"
"add.u64 \t$dst, %lhs, %rhs;\n\t"
"}}",
@@ -1765,8 +1766,9 @@ def ROTR64reg_sw :
".reg .b64 %lhs;\n\t"
".reg .b64 %rhs;\n\t"
".reg .u32 %amt2;\n\t"
-"shr.b64 \t%lhs, $src, $amt;\n\t"
-"sub.u32 \t%amt2, 64, $amt;\n\t"
+"and.b32 \t%amt2, $amt, 63;\n\t"
+"shr.b64 \t%lhs, $src, %amt2;\n\t"
+"sub.u32 \t%amt2, 64, %amt2;\n\t"
"shl.b64 \t%rhs, $src, %amt2;\n\t"
"add.u64 \t$dst, %lhs, %rhs;\n\t"
"}}",
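One subtlety in the sequences above (my reading of the PTX ISA, not something the patch states): when $amt is a multiple of 64, %amt2 becomes 0 and the sub.u32 yields 64; PTX clamps shift amounts at the register width, so the 64-bit shift contributes 0 and the rotation correctly returns the source unchanged. A C++ emulation of the fixed ROTR64reg_sw sequence, with that clamp made explicit because such shifts are undefined in C++:

```cpp
#include <cstdint>

// PTX-style logical shifts: amounts >= 64 are clamped and produce 0.
static uint64_t shr64(uint64_t v, uint32_t n) { return n >= 64 ? 0 : v >> n; }
static uint64_t shl64(uint64_t v, uint32_t n) { return n >= 64 ? 0 : v << n; }

// Mirrors the fixed ROTR64reg_sw sequence (helper names are illustrative).
uint64_t rotr64_sw(uint64_t src, uint32_t amt) {
  uint32_t amt2 = amt & 63;               // and.b32 %amt2, $amt, 63
  uint64_t lhs  = shr64(src, amt2);       // shr.b64 %lhs, $src, %amt2
  uint64_t rhs  = shl64(src, 64 - amt2);  // sub.u32 + shl.b64 %rhs, $src, %amt2
  return lhs + rhs;                       // add.u64: operands have disjoint bits
}
```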
339 changes: 314 additions & 25 deletions llvm/test/CodeGen/NVPTX/rotate.ll
@@ -1,7 +1,8 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s --mtriple=nvptx64 -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
+; RUN: llc < %s --mtriple=nvptx64 -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
+; RUN: %if ptxas %{ llc < %s --mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s --mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}


declare i32 @llvm.nvvm.rotate.b32(i32, i32)
@@ -11,50 +12,338 @@ declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
; SM20: rotate32
; SM35: rotate32
define i32 @rotate32(i32 %a, i32 %b) {
-; SM20: shl.b32
-; SM20: sub.s32
-; SM20: shr.b32
-; SM20: add.u32
-; SM35: shf.l.wrap.b32
+; SM20-LABEL: rotate32(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<4>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u32 %r1, [rotate32_param_0];
+; SM20-NEXT: ld.param.u32 %r2, [rotate32_param_1];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b32 %lhs;
+; SM20-NEXT: .reg .b32 %rhs;
+; SM20-NEXT: .reg .b32 %amt2;
+; SM20-NEXT: shl.b32 %lhs, %r1, %r2;
+; SM20-NEXT: sub.s32 %amt2, 32, %r2;
+; SM20-NEXT: shr.b32 %rhs, %r1, %amt2;
+; SM20-NEXT: add.u32 %r3, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b32 [func_retval0+0], %r3;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotate32(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<4>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u32 %r1, [rotate32_param_0];
+; SM35-NEXT: ld.param.u32 %r2, [rotate32_param_1];
+; SM35-NEXT: shf.l.wrap.b32 %r3, %r1, %r1, %r2;
+; SM35-NEXT: st.param.b32 [func_retval0+0], %r3;
+; SM35-NEXT: ret;
%val = tail call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 %b)
ret i32 %val
}

; SM20: rotate64
; SM35: rotate64
define i64 @rotate64(i64 %a, i32 %b) {
-; SM20: shl.b64
-; SM20: sub.u32
-; SM20: shr.b64
-; SM20: add.u64
-; SM35: shf.l.wrap.b32
-; SM35: shf.l.wrap.b32
+; SM20-LABEL: rotate64(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<2>;
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
+; SM20-NEXT: ld.param.u32 %r1, [rotate64_param_1];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: .reg .u32 %amt2;
+; SM20-NEXT: and.b32 %amt2, %r1, 63;
+; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotate64(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<6>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b32 %dummy;
+; SM35-NEXT: mov.b64 {%dummy,%r1}, %rd1;
+; SM35-NEXT: }
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b32 %dummy;
+; SM35-NEXT: mov.b64 {%r2,%dummy}, %rd1;
+; SM35-NEXT: }
+; SM35-NEXT: ld.param.u32 %r3, [rotate64_param_1];
+; SM35-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, %r3;
+; SM35-NEXT: shf.l.wrap.b32 %r5, %r1, %r2, %r3;
+; SM35-NEXT: mov.b64 %rd2, {%r5, %r4};
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ret;
%val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 %b)
ret i64 %val
}

; SM20: rotateright64
; SM35: rotateright64
define i64 @rotateright64(i64 %a, i32 %b) {
-; SM20: shr.b64
-; SM20: sub.u32
-; SM20: shl.b64
-; SM20: add.u64
-; SM35: shf.r.wrap.b32
-; SM35: shf.r.wrap.b32
+; SM20-LABEL: rotateright64(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<2>;
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotateright64_param_0];
+; SM20-NEXT: ld.param.u32 %r1, [rotateright64_param_1];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: .reg .u32 %amt2;
+; SM20-NEXT: and.b32 %amt2, %r1, 63;
+; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotateright64(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<6>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotateright64_param_0];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b32 %dummy;
+; SM35-NEXT: mov.b64 {%r1,%dummy}, %rd1;
+; SM35-NEXT: }
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b32 %dummy;
+; SM35-NEXT: mov.b64 {%dummy,%r2}, %rd1;
+; SM35-NEXT: }
+; SM35-NEXT: ld.param.u32 %r3, [rotateright64_param_1];
+; SM35-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, %r3;
+; SM35-NEXT: shf.r.wrap.b32 %r5, %r1, %r2, %r3;
+; SM35-NEXT: mov.b64 %rd2, {%r5, %r4};
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ret;
%val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 %b)
ret i64 %val
}

; SM20: rotl0
; SM35: rotl0
define i32 @rotl0(i32 %x) {
-; SM20: shl.b32
-; SM20: shr.b32
-; SM20: add.u32
-; SM35: shf.l.wrap.b32
+; SM20-LABEL: rotl0(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u32 %r1, [rotl0_param_0];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b32 %lhs;
+; SM20-NEXT: .reg .b32 %rhs;
+; SM20-NEXT: shl.b32 %lhs, %r1, 8;
+; SM20-NEXT: shr.b32 %rhs, %r1, 24;
+; SM20-NEXT: add.u32 %r2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b32 [func_retval0+0], %r2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotl0(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u32 %r1, [rotl0_param_0];
+; SM35-NEXT: shf.l.wrap.b32 %r2, %r1, %r1, 8;
+; SM35-NEXT: st.param.b32 [func_retval0+0], %r2;
+; SM35-NEXT: ret;
%t0 = shl i32 %x, 8
%t1 = lshr i32 %x, 24
%t2 = or i32 %t0, %t1
ret i32 %t2
}

+declare i64 @llvm.fshl.i64(i64, i64, i64)
+declare i64 @llvm.fshr.i64(i64, i64, i64)
+
+; SM35: rotl64
+define i64 @rotl64(i64 %a, i64 %n) {
+; SM20-LABEL: rotl64(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<2>;
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotl64_param_0];
+; SM20-NEXT: ld.param.u32 %r1, [rotl64_param_1];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: .reg .u32 %amt2;
+; SM20-NEXT: and.b32 %amt2, %r1, 63;
+; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotl64(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<2>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotl64_param_0];
+; SM35-NEXT: ld.param.u32 %r1, [rotl64_param_1];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b64 %lhs;
+; SM35-NEXT: .reg .b64 %rhs;
+; SM35-NEXT: .reg .u32 %amt2;
+; SM35-NEXT: and.b32 %amt2, %r1, 63;
+; SM35-NEXT: shl.b64 %lhs, %rd1, %amt2;
+; SM35-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM35-NEXT: shr.b64 %rhs, %rd1, %amt2;
+; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT: }
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ret;
+%val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %n)
+ret i64 %val
+}
+
+; SM35: rotl64_imm
+define i64 @rotl64_imm(i64 %a) {
+; SM20-LABEL: rotl64_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: shl.b64 %lhs, %rd1, 2;
+; SM20-NEXT: shr.b64 %rhs, %rd1, 62;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotl64_imm(
+; SM35: {
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b64 %lhs;
+; SM35-NEXT: .reg .b64 %rhs;
+; SM35-NEXT: shl.b64 %lhs, %rd1, 2;
+; SM35-NEXT: shr.b64 %rhs, %rd1, 62;
+; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT: }
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ret;
+%val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66)
+ret i64 %val
+}
+
+; SM35: rotr64
+define i64 @rotr64(i64 %a, i64 %n) {
+; SM20-LABEL: rotr64(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<2>;
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotr64_param_0];
+; SM20-NEXT: ld.param.u32 %r1, [rotr64_param_1];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: .reg .u32 %amt2;
+; SM20-NEXT: and.b32 %amt2, %r1, 63;
+; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotr64(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<2>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotr64_param_0];
+; SM35-NEXT: ld.param.u32 %r1, [rotr64_param_1];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b64 %lhs;
+; SM35-NEXT: .reg .b64 %rhs;
+; SM35-NEXT: .reg .u32 %amt2;
+; SM35-NEXT: and.b32 %amt2, %r1, 63;
+; SM35-NEXT: shr.b64 %lhs, %rd1, %amt2;
+; SM35-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM35-NEXT: shl.b64 %rhs, %rd1, %amt2;
+; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT: }
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ret;
+%val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %n)
+ret i64 %val
+}
+
+; SM35: rotr64_imm
+define i64 @rotr64_imm(i64 %a) {
+; SM20-LABEL: rotr64_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: shl.b64 %lhs, %rd1, 62;
+; SM20-NEXT: shr.b64 %rhs, %rd1, 2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotr64_imm(
+; SM35: {
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b64 %lhs;
+; SM35-NEXT: .reg .b64 %rhs;
+; SM35-NEXT: shl.b64 %lhs, %rd1, 62;
+; SM35-NEXT: shr.b64 %rhs, %rd1, 2;
+; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT: }
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ret;
+%val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66)
+ret i64 %val
+}
