diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index ca169b1c208e..3f53159f8213 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -1934,42 +1934,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
 // Vector min, max
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx2_pmaxu_b : GCCBuiltin<"__builtin_ia32_pmaxub256">,
-      Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
-                 llvm_v32i8_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_pmaxu_w : GCCBuiltin<"__builtin_ia32_pmaxuw256">,
-      Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
-                 llvm_v16i16_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_pmaxu_d : GCCBuiltin<"__builtin_ia32_pmaxud256">,
-      Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
-                 llvm_v8i32_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_pmaxs_b : GCCBuiltin<"__builtin_ia32_pmaxsb256">,
-      Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
-                 llvm_v32i8_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_pmaxs_w : GCCBuiltin<"__builtin_ia32_pmaxsw256">,
-      Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
-                 llvm_v16i16_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_pmaxs_d : GCCBuiltin<"__builtin_ia32_pmaxsd256">,
-      Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
-                 llvm_v8i32_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_pminu_b : GCCBuiltin<"__builtin_ia32_pminub256">,
-      Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
-                 llvm_v32i8_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_pminu_w : GCCBuiltin<"__builtin_ia32_pminuw256">,
-      Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
-                 llvm_v16i16_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_pminu_d : GCCBuiltin<"__builtin_ia32_pminud256">,
-      Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
-                 llvm_v8i32_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_pmins_b : GCCBuiltin<"__builtin_ia32_pminsb256">,
-      Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
-                 llvm_v32i8_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_pmins_w : GCCBuiltin<"__builtin_ia32_pminsw256">,
-      Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
-                 llvm_v16i16_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_pmins_d : GCCBuiltin<"__builtin_ia32_pminsd256">,
-      Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
-                 llvm_v8i32_ty], [IntrNoMem, Commutative]>;
   def int_x86_avx512_mask_pmaxs_b_128 : GCCBuiltin<"__builtin_ia32_pmaxsb128_mask">,
           Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
                      llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index f13009a4b4e4..d3ee5e8e5dbb 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -186,6 +186,8 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
         Name == "x86.sse2.pminu.b" ||
         Name == "x86.sse41.pminuw" ||
         Name == "x86.sse41.pminud" ||
+        Name.startswith("x86.avx2.pmax") ||
+        Name.startswith("x86.avx2.pmin") ||
         Name.startswith("x86.avx2.vbroadcast") ||
         Name.startswith("x86.avx2.pbroadcast") ||
         Name.startswith("x86.avx.vpermil.") ||
@@ -566,19 +568,23 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       Rep = Builder.CreateSExt(Rep, CI->getType(), "");
     } else if (Name == "llvm.x86.sse41.pmaxsb" ||
               Name == "llvm.x86.sse2.pmaxs.w" ||
-               Name == "llvm.x86.sse41.pmaxsd") {
+               Name == "llvm.x86.sse41.pmaxsd" ||
+               Name.startswith("llvm.x86.avx2.pmaxs")) {
      Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_SGT);
    } else if (Name == "llvm.x86.sse2.pmaxu.b" ||
               Name == "llvm.x86.sse41.pmaxuw" ||
-               Name == "llvm.x86.sse41.pmaxud") {
+               Name == "llvm.x86.sse41.pmaxud" ||
+               Name.startswith("llvm.x86.avx2.pmaxu")) {
      Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_UGT);
    } else if (Name == "llvm.x86.sse41.pminsb" ||
               Name == "llvm.x86.sse2.pmins.w" ||
-               Name == "llvm.x86.sse41.pminsd") {
+               Name == "llvm.x86.sse41.pminsd" ||
+               Name.startswith("llvm.x86.avx2.pmins")) {
      Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_SLT);
    } else if (Name == "llvm.x86.sse2.pminu.b" ||
               Name == "llvm.x86.sse41.pminuw" ||
-               Name == "llvm.x86.sse41.pminud") {
+               Name == "llvm.x86.sse41.pminud" ||
+               Name.startswith("llvm.x86.avx2.pminu")) {
      Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_ULT);
    } else if (Name == "llvm.x86.sse2.cvtdq2pd" ||
               Name == "llvm.x86.sse2.cvtps2pd" ||
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 8fab49cf4492..d81412510a72 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -291,18 +291,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
   X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
   X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0),
-  X86_INTRINSIC_DATA(avx2_pmaxs_b, INTR_TYPE_2OP, ISD::SMAX, 0),
-  X86_INTRINSIC_DATA(avx2_pmaxs_d, INTR_TYPE_2OP, ISD::SMAX, 0),
-  X86_INTRINSIC_DATA(avx2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0),
-  X86_INTRINSIC_DATA(avx2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0),
-  X86_INTRINSIC_DATA(avx2_pmaxu_d, INTR_TYPE_2OP, ISD::UMAX, 0),
-  X86_INTRINSIC_DATA(avx2_pmaxu_w, INTR_TYPE_2OP, ISD::UMAX, 0),
-  X86_INTRINSIC_DATA(avx2_pmins_b, INTR_TYPE_2OP, ISD::SMIN, 0),
-  X86_INTRINSIC_DATA(avx2_pmins_d, INTR_TYPE_2OP, ISD::SMIN, 0),
-  X86_INTRINSIC_DATA(avx2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0),
-  X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0),
-  X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, ISD::UMIN, 0),
-  X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, ISD::UMIN, 0),
   X86_INTRINSIC_DATA(avx2_pmovmskb, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
   X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
diff --git a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
index b9f1f2283f26..4f410b415c08 100644
--- a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -1931,11 +1931,11 @@ define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
 ; X64-NEXT:    retq
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %res = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %arg0, <32 x i8> %arg1)
-  %bc = bitcast <32 x i8> %res to <4 x i64>
+  %cmp = icmp sgt <32 x i8> %arg0, %arg1
+  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
+  %bc = bitcast <32 x i8> %sel to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 ; X32-LABEL: test_mm256_max_epi16:
@@ -1949,11 +1949,11 @@ define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 ; X64-NEXT:    retq
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %res = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %arg0, <16 x i16> %arg1)
-  %bc = bitcast <16 x i16> %res to <4 x i64>
+  %cmp = icmp sgt <16 x i16> %arg0, %arg1
+  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
+  %bc = bitcast <16 x i16> %sel to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 ; X32-LABEL: test_mm256_max_epi32:
@@ -1967,11 +1967,11 @@ define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 ; X64-NEXT:    retq
   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
-  %res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %arg0, <8 x i32> %arg1)
-  %bc = bitcast <8 x i32> %res to <4 x i64>
+  %cmp = icmp sgt <8 x i32> %arg0, %arg1
+  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
+  %bc = bitcast <8 x i32> %sel to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
 ; X32-LABEL: test_mm256_max_epu8:
@@ -1985,11 +1985,11 @@ define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
 ; X64-NEXT:    retq
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %res = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %arg0, <32 x i8> %arg1)
-  %bc = bitcast <32 x i8> %res to <4 x i64>
+  %cmp = icmp ugt <32 x i8> %arg0, %arg1
+  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
+  %bc = bitcast <32 x i8> %sel to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
 ; X32-LABEL: test_mm256_max_epu16:
@@ -2003,11 +2003,11 @@ define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
 ; X64-NEXT:    retq
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %res = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %arg0, <16 x i16> %arg1)
-  %bc = bitcast <16 x i16> %res to <4 x i64>
+  %cmp = icmp ugt <16 x i16> %arg0, %arg1
+  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
+  %bc = bitcast <16 x i16> %sel to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
 ; X32-LABEL: test_mm256_max_epu32:
@@ -2021,11 +2021,11 @@ define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
 ; X64-NEXT:    retq
   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
-  %res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %arg0, <8 x i32> %arg1)
-  %bc = bitcast <8 x i32> %res to <4 x i64>
+  %cmp = icmp ugt <8 x i32> %arg0, %arg1
+  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
+  %bc = bitcast <8 x i32> %sel to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
 ; X32-LABEL: test_mm256_min_epi8:
@@ -2039,11 +2039,11 @@ define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
 ; X64-NEXT:    retq
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %res = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %arg0, <32 x i8> %arg1)
-  %bc = bitcast <32 x i8> %res to <4 x i64>
+  %cmp = icmp slt <32 x i8> %arg0, %arg1
+  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
+  %bc = bitcast <32 x i8> %sel to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 ; X32-LABEL: test_mm256_min_epi16:
@@ -2057,11 +2057,11 @@ define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 ; X64-NEXT:    retq
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %res = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %arg0, <16 x i16> %arg1)
-  %bc = bitcast <16 x i16> %res to <4 x i64>
+  %cmp = icmp slt <16 x i16> %arg0, %arg1
+  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
+  %bc = bitcast <16 x i16> %sel to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 ; X32-LABEL: test_mm256_min_epi32:
@@ -2075,11 +2075,11 @@ define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 ; X64-NEXT:    retq
   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
-  %res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %arg0, <8 x i32> %arg1)
-  %bc = bitcast <8 x i32> %res to <4 x i64>
+  %cmp = icmp slt <8 x i32> %arg0, %arg1
+  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
+  %bc = bitcast <8 x i32> %sel to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
 ; X32-LABEL: test_mm256_min_epu8:
@@ -2093,11 +2093,11 @@ define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
 ; X64-NEXT:    retq
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %res = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %arg0, <32 x i8> %arg1)
-  %bc = bitcast <32 x i8> %res to <4 x i64>
+  %cmp = icmp ult <32 x i8> %arg0, %arg1
+  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
+  %bc = bitcast <32 x i8> %sel to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
 ; X32-LABEL: test_mm256_min_epu16:
@@ -2111,11 +2111,11 @@ define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
 ; X64-NEXT:    retq
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %res = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %arg0, <16 x i16> %arg1)
-  %bc = bitcast <16 x i16> %res to <4 x i64>
+  %cmp = icmp ult <16 x i16> %arg0, %arg1
+  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
+  %bc = bitcast <16 x i16> %sel to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
 ; X32-LABEL: test_mm256_min_epu32:
@@ -2129,11 +2129,11 @@ define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
 ; X64-NEXT:    retq
   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
-  %res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %arg0, <8 x i32> %arg1)
-  %bc = bitcast <8 x i32> %res to <4 x i64>
+  %cmp = icmp ult <8 x i32> %arg0, %arg1
+  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
+  %bc = bitcast <8 x i32> %sel to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
 ; X32-LABEL: test_mm256_movemask_epi8:
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
index 2c7d055fe2a0..b6b8447beda1 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mattr=avx2 | FileCheck %s
 
 define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
@@ -95,7 +95,6 @@ define <2 x i64> @test_x86_avx2_vextracti128(<4 x i64> %a0) {
 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retl
-
   %res = call <2 x i64> @llvm.x86.avx2.vextracti128(<4 x i64> %a0, i8 7)
   ret <2 x i64> %res
 }
@@ -107,7 +106,6 @@ define <4 x i64> @test_x86_avx2_vinserti128(<4 x i64> %a0, <2 x i64> %a1) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retl
-
   %res = call <4 x i64> @llvm.x86.avx2.vinserti128(<4 x i64> %a0, <2 x i64> %a1, i8 7)
   ret <4 x i64> %res
 }
@@ -381,3 +379,136 @@ define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
   ret void
 }
 declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
+
+define <32 x i8> @mm256_max_epi8(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: mm256_max_epi8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retl
+;
+  %res = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @mm256_max_epi16(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: mm256_max_epi16:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retl
+;
+  %res = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i32> @mm256_max_epi32(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: mm256_max_epi32:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retl
+;
+  %res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <32 x i8> @mm256_max_epu8(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: mm256_max_epu8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retl
+;
+  %res = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @mm256_max_epu16(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: mm256_max_epu16:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retl
+;
+  %res = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i32> @mm256_max_epu32(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: mm256_max_epu32:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retl
+;
+  %res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <32 x i8> @mm256_min_epi8(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: mm256_min_epi8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retl
+;
+  %res = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @mm256_min_epi16(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: mm256_min_epi16:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retl
+;
+  %res = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i32> @mm256_min_epi32(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: mm256_min_epi32:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retl
+;
+  %res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <32 x i8> @mm256_min_epu8(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: mm256_min_epu8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retl
+;
+  %res = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @mm256_min_epu16(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: mm256_min_epu16:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retl
+;
+  %res = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i32> @mm256_min_epu32(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: mm256_min_epu32:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retl
+;
+  %res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
+