Skip to content

Commit

Permalink
[ARM,AArch64] Add intrinsics for dot product instructions
Browse files Browse the repository at this point in the history
The ACLE spec which describes these intrinsics hasn't been published yet, but
this is based on the final draft which will be published soon, and these have
already been implemented by GCC.

Differential revision: https://reviews.llvm.org/D46109



git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@331039 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
ostannard committed Apr 27, 2018
1 parent 6769f8d commit acc1201
Show file tree
Hide file tree
Showing 6 changed files with 238 additions and 0 deletions.
17 changes: 17 additions & 0 deletions include/clang/Basic/arm_neon.td
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,13 @@ def OP_SCALAR_HALF_SET_LNQ : Op<(bitcast "float16x8_t",
(bitcast "int16_t", $p0),
(bitcast "int16x8_t", $p1), $p2))>;

def OP_DOT_LN
: Op<(call "vdot", $p0, $p1,
(bitcast $p1, (splat(bitcast "uint32x2_t", $p2), $p3)))>;
def OP_DOT_LNQ
: Op<(call "vdot", $p0, $p1,
(bitcast $p1, (splat(bitcast "uint32x4_t", $p2), $p3)))>;

//===----------------------------------------------------------------------===//
// Instructions
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -1579,3 +1586,13 @@ let ArchGuard = "defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarc
def SCALAR_VDUP_LANEH : IInst<"vdup_lane", "sdi", "Sh">;
def SCALAR_VDUP_LANEQH : IInst<"vdup_laneq", "sji", "Sh">;
}

// v8.2-A dot product instructions.
let ArchGuard = "defined(__ARM_FEATURE_DOTPROD)" in {
def DOT : SInst<"vdot", "dd88", "iQiUiQUi">;
def DOT_LANE : SOpInst<"vdot_lane", "dd87i", "iUiQiQUi", OP_DOT_LN>;
}
let ArchGuard = "defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)" in {
// Variants indexing into a 128-bit vector are A64 only.
def UDOT_LANEQ : SOpInst<"vdot_laneq", "dd89i", "iUiQiQUi", OP_DOT_LNQ>;
}
3 changes: 3 additions & 0 deletions include/clang/Basic/arm_neon_incl.td
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,9 @@ def OP_UNAVAILABLE : Operation {
// B,C,D: array of default elts, force 'Q' size modifier.
// p: pointer type
// c: const pointer type
// 7: vector of 8-bit elements, ignore 'Q' size modifier
// 8: vector of 8-bit elements, same width as default type
// 9: vector of 8-bit elements, force 'Q' size modifier

// Every intrinsic subclasses Inst.
class Inst <string n, string p, string t, Operation o> {
Expand Down
12 changes: 12 additions & 0 deletions lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3867,6 +3867,8 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = {
NEONMAP0(vcvtq_u16_v),
NEONMAP0(vcvtq_u32_v),
NEONMAP0(vcvtq_u64_v),
NEONMAP2(vdot_v, arm_neon_udot, arm_neon_sdot, 0),
NEONMAP2(vdotq_v, arm_neon_udot, arm_neon_sdot, 0),
NEONMAP0(vext_v),
NEONMAP0(vextq_v),
NEONMAP0(vfma_v),
Expand Down Expand Up @@ -4061,6 +4063,8 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType),
NEONMAP2(vdot_v, aarch64_neon_udot, aarch64_neon_sdot, 0),
NEONMAP2(vdotq_v, aarch64_neon_udot, aarch64_neon_sdot, 0),
NEONMAP0(vext_v),
NEONMAP0(vextq_v),
NEONMAP0(vfma_v),
Expand Down Expand Up @@ -4974,6 +4978,14 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
}
return SV;
}
case NEON::BI__builtin_neon_vdot_v:
case NEON::BI__builtin_neon_vdotq_v: {
llvm::Type *InputTy =
llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
llvm::Type *Tys[2] = { Ty, InputTy };
Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot");
}
}

assert(Int && "Expected valid intrinsic number");
Expand Down
117 changes: 117 additions & 0 deletions test/CodeGen/aarch64-neon-dot-product.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +dotprod \
// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -instcombine | FileCheck %s

// REQUIRES: aarch64-registered-target

// Test AArch64 Armv8.2-A dot product intrinsics

#include <arm_neon.h>

uint32x2_t test_vdot_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) {
// CHECK-LABEL: define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: ret <2 x i32> [[RESULT]]
return vdot_u32(a, b, c);
}

uint32x4_t test_vdotq_u32(uint32x4_t a, uint8x16_t b, uint8x16_t c) {
// CHECK-LABEL: define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
// CHECK: ret <4 x i32> [[RESULT]]
return vdotq_u32(a, b, c);
}

int32x2_t test_vdot_s32(int32x2_t a, int8x8_t b, int8x8_t c) {
// CHECK-LABEL: define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: ret <2 x i32> [[RESULT]]
return vdot_s32(a, b, c);
}

int32x4_t test_vdotq_s32(int32x4_t a, int8x16_t b, int8x16_t c) {
// CHECK-LABEL: define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
// CHECK: ret <4 x i32> [[RESULT]]
return vdotq_s32(a, b, c);
}

uint32x2_t test_vdot_lane_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) {
// CHECK-LABEL: define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
// CHECK: ret <2 x i32> [[RESULT]]
return vdot_lane_u32(a, b, c, 1);
}

uint32x4_t test_vdotq_lane_u32(uint32x4_t a, uint8x16_t b, uint8x8_t c) {
// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c)
// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
// CHECK: ret <4 x i32> [[RESULT]]
return vdotq_lane_u32(a, b, c, 1);
}

uint32x2_t test_vdot_laneq_u32(uint32x2_t a, uint8x8_t b, uint8x16_t c) {
// CHECK-LABEL: define <2 x i32> @test_vdot_laneq_u32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c)
// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
// CHECK: ret <2 x i32> [[RESULT]]
return vdot_laneq_u32(a, b, c, 1);
}

uint32x4_t test_vdotq_laneq_u32(uint32x4_t a, uint8x16_t b, uint8x16_t c) {
// CHECK-LABEL: define <4 x i32> @test_vdotq_laneq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
// CHECK: ret <4 x i32> [[RESULT]]
return vdotq_laneq_u32(a, b, c, 1);
}

int32x2_t test_vdot_lane_s32(int32x2_t a, int8x8_t b, int8x8_t c) {
// CHECK-LABEL: define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
// CHECK: ret <2 x i32> [[RESULT]]
return vdot_lane_s32(a, b, c, 1);
}

int32x4_t test_vdotq_lane_s32(int32x4_t a, int8x16_t b, int8x8_t c) {
// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c)
// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
// CHECK: ret <4 x i32> [[RESULT]]
return vdotq_lane_s32(a, b, c, 1);
}

int32x2_t test_vdot_laneq_s32(int32x2_t a, int8x8_t b, int8x16_t c) {
// CHECK-LABEL: define <2 x i32> @test_vdot_laneq_s32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c)
// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
// CHECK: ret <2 x i32> [[RESULT]]
return vdot_laneq_s32(a, b, c, 1);
}

int32x4_t test_vdotq_laneq_s32(int32x4_t a, int8x16_t b, int8x16_t c) {
// CHECK-LABEL: define <4 x i32> @test_vdotq_laneq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
// CHECK: ret <4 x i32> [[RESULT]]
return vdotq_laneq_s32(a, b, c, 1);
}

76 changes: 76 additions & 0 deletions test/CodeGen/arm-neon-dot-product.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// RUN: %clang_cc1 -triple armv8-linux-gnueabihf -target-cpu cortex-a75 -target-feature +dotprod \
// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -instcombine | FileCheck %s

// REQUIRES: arm-registered-target

// Test ARM v8.2-A dot product intrinsics

#include <arm_neon.h>

uint32x2_t test_vdot_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) {
// CHECK-LABEL: define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: ret <2 x i32> [[RESULT]]
return vdot_u32(a, b, c);
}

uint32x4_t test_vdotq_u32(uint32x4_t a, uint8x16_t b, uint8x16_t c) {
// CHECK-LABEL: define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
// CHECK: ret <4 x i32> [[RESULT]]
return vdotq_u32(a, b, c);
}

int32x2_t test_vdot_s32(int32x2_t a, int8x8_t b, int8x8_t c) {
// CHECK-LABEL: define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: ret <2 x i32> [[RESULT]]
return vdot_s32(a, b, c);
}

int32x4_t test_vdotq_s32(int32x4_t a, int8x16_t b, int8x16_t c) {
// CHECK-LABEL: define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
// CHECK: ret <4 x i32> [[RESULT]]
return vdotq_s32(a, b, c);
}

uint32x2_t test_vdot_lane_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) {
// CHECK-LABEL: define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
// CHECK: ret <2 x i32> [[RESULT]]
return vdot_lane_u32(a, b, c, 1);
}

uint32x4_t test_vdotq_lane_u32(uint32x4_t a, uint8x16_t b, uint8x8_t c) {
// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c)
// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
// CHECK: ret <4 x i32> [[RESULT]]
return vdotq_lane_u32(a, b, c, 1);
}

int32x2_t test_vdot_lane_s32(int32x2_t a, int8x8_t b, int8x8_t c) {
// CHECK-LABEL: define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
// CHECK: ret <2 x i32> [[RESULT]]
return vdot_lane_s32(a, b, c, 1);
}

int32x4_t test_vdotq_lane_s32(int32x4_t a, int8x16_t b, int8x8_t c) {
// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c)
// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
// CHECK: ret <4 x i32> [[RESULT]]
return vdotq_lane_s32(a, b, c, 1);
}
13 changes: 13 additions & 0 deletions utils/TableGen/NeonEmitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -995,6 +995,19 @@ void Type::applyModifier(char Mod) {
if (!AppliedQuad)
Bitwidth *= 2;
break;
case '7':
if (AppliedQuad)
Bitwidth /= 2;
ElementBitwidth = 8;
break;
case '8':
ElementBitwidth = 8;
break;
case '9':
if (!AppliedQuad)
Bitwidth *= 2;
ElementBitwidth = 8;
break;
default:
llvm_unreachable("Unhandled character!");
}
Expand Down

0 comments on commit acc1201

Please sign in to comment.