aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--llvm/include/llvm/IR/IntrinsicsNVVM.td26
-rw-r--r--llvm/lib/IR/AutoUpgrade.cpp17
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXIntrinsics.td13
-rw-r--r--llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll84
-rw-r--r--llvm/test/CodeGen/NVPTX/mulhi-intrins.ll104
-rw-r--r--llvm/test/CodeGen/NVPTX/sad-intrins.ll110
6 files changed, 312 insertions, 42 deletions
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 2c8f28ac9..bcc3bf2 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -615,6 +615,13 @@ let TargetPrefix = "nvvm" in {
// Multiplication
//
+ def int_nvvm_mulhi_s : ClangBuiltin<"__nvvm_mulhi_s">,
+ DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
+ def int_nvvm_mulhi_us : ClangBuiltin<"__nvvm_mulhi_us">,
+ DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
+
def int_nvvm_mulhi_i : ClangBuiltin<"__nvvm_mulhi_i">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable, Commutative]>;
@@ -730,12 +737,27 @@ let TargetPrefix = "nvvm" in {
// Sad
//
+ def int_nvvm_sad_s : ClangBuiltin<"__nvvm_sad_s">,
+ DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+ [IntrNoMem, Commutative, IntrSpeculatable]>;
+ def int_nvvm_sad_us : ClangBuiltin<"__nvvm_sad_us">,
+ DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+ [IntrNoMem, Commutative, IntrSpeculatable]>;
+
def int_nvvm_sad_i : ClangBuiltin<"__nvvm_sad_i">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, Commutative]>;
+ [IntrNoMem, Commutative, IntrSpeculatable]>;
def int_nvvm_sad_ui : ClangBuiltin<"__nvvm_sad_ui">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, Commutative]>;
+ [IntrNoMem, Commutative, IntrSpeculatable]>;
+
+ def int_nvvm_sad_ll : ClangBuiltin<"__nvvm_sad_ll">,
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i64_ty],
+ [IntrNoMem, Commutative, IntrSpeculatable]>;
+ def int_nvvm_sad_ull : ClangBuiltin<"__nvvm_sad_ull">,
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i64_ty],
+ [IntrNoMem, Commutative, IntrSpeculatable]>;
+
//
// Floor Ceil
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 1a9e474..d2338e0 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1209,7 +1209,8 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
Expand = true;
else if (Name.consume_front("max.") || Name.consume_front("min."))
// nvvm.{min,max}.{i,ii,ui,ull}
- Expand = Name == "i" || Name == "ll" || Name == "ui" || Name == "ull";
+ Expand = Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
+ Name == "ui" || Name == "ull";
else if (Name.consume_front("atomic.load.add."))
// nvvm.atomic.load.add.{f32.p,f64.p}
Expand = Name.starts_with("f32.p") || Name.starts_with("f64.p");
@@ -4132,19 +4133,21 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
Value *Val = CI->getArgOperand(1);
Rep = Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(),
AtomicOrdering::SequentiallyConsistent);
- } else if (IsNVVM && (Name == "max.i" || Name == "max.ll" ||
- Name == "max.ui" || Name == "max.ull")) {
+ } else if (IsNVVM && Name.consume_front("max.") &&
+ (Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
+ Name == "ui" || Name == "ull")) {
Value *Arg0 = CI->getArgOperand(0);
Value *Arg1 = CI->getArgOperand(1);
- Value *Cmp = Name.ends_with(".ui") || Name.ends_with(".ull")
+ Value *Cmp = Name.starts_with("u")
? Builder.CreateICmpUGE(Arg0, Arg1, "max.cond")
: Builder.CreateICmpSGE(Arg0, Arg1, "max.cond");
Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "max");
- } else if (IsNVVM && (Name == "min.i" || Name == "min.ll" ||
- Name == "min.ui" || Name == "min.ull")) {
+ } else if (IsNVVM && Name.consume_front("min.") &&
+ (Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
+ Name == "ui" || Name == "ull")) {
Value *Arg0 = CI->getArgOperand(0);
Value *Arg1 = CI->getArgOperand(1);
- Value *Cmp = Name.ends_with(".ui") || Name.ends_with(".ull")
+ Value *Cmp = Name.starts_with("u")
? Builder.CreateICmpULE(Arg0, Arg1, "min.cond")
: Builder.CreateICmpSLE(Arg0, Arg1, "min.cond");
Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "min");
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 7b78ffe..15a136f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -770,11 +770,14 @@ defm INT_NVVM_FMAN : MIN_MAX<"max">;
// Multiplication
//
+def INT_NVVM_MULHI_S : F_MATH_2<"mul.hi.s16 \t$dst, $src0, $src1;", Int16Regs,
+ Int16Regs, Int16Regs, int_nvvm_mulhi_s>;
+def INT_NVVM_MULHI_US : F_MATH_2<"mul.hi.u16 \t$dst, $src0, $src1;", Int16Regs,
+ Int16Regs, Int16Regs, int_nvvm_mulhi_us>;
def INT_NVVM_MULHI_I : F_MATH_2<"mul.hi.s32 \t$dst, $src0, $src1;", Int32Regs,
Int32Regs, Int32Regs, int_nvvm_mulhi_i>;
def INT_NVVM_MULHI_UI : F_MATH_2<"mul.hi.u32 \t$dst, $src0, $src1;", Int32Regs,
Int32Regs, Int32Regs, int_nvvm_mulhi_ui>;
-
def INT_NVVM_MULHI_LL : F_MATH_2<"mul.hi.s64 \t$dst, $src0, $src1;", Int64Regs,
Int64Regs, Int64Regs, int_nvvm_mulhi_ll>;
def INT_NVVM_MULHI_ULL : F_MATH_2<"mul.hi.u64 \t$dst, $src0, $src1;", Int64Regs,
@@ -851,10 +854,18 @@ def INT_NVVM_DIV_RP_D : F_MATH_2<"div.rp.f64 \t$dst, $src0, $src1;",
// Sad
//
+def INT_NVVM_SAD_S : F_MATH_3<"sad.s16 \t$dst, $src0, $src1, $src2;",
+ Int16Regs, Int16Regs, Int16Regs, Int16Regs, int_nvvm_sad_s>;
+def INT_NVVM_SAD_US : F_MATH_3<"sad.u16 \t$dst, $src0, $src1, $src2;",
+ Int16Regs, Int16Regs, Int16Regs, Int16Regs, int_nvvm_sad_us>;
def INT_NVVM_SAD_I : F_MATH_3<"sad.s32 \t$dst, $src0, $src1, $src2;",
Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_i>;
def INT_NVVM_SAD_UI : F_MATH_3<"sad.u32 \t$dst, $src0, $src1, $src2;",
Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_ui>;
+def INT_NVVM_SAD_LL : F_MATH_3<"sad.s64 \t$dst, $src0, $src1, $src2;",
+ Int64Regs, Int64Regs, Int64Regs, Int64Regs, int_nvvm_sad_ll>;
+def INT_NVVM_SAD_ULL : F_MATH_3<"sad.u64 \t$dst, $src0, $src1, $src2;",
+ Int64Regs, Int64Regs, Int64Regs, Int64Regs, int_nvvm_sad_ull>;
//
// Floor Ceil
diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
index a295e5e..1c11e12 100644
--- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
+++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -13,12 +13,16 @@ declare float @llvm.nvvm.h2f(i16)
declare i32 @llvm.nvvm.abs.i(i32)
declare i64 @llvm.nvvm.abs.ll(i64)
+declare i16 @llvm.nvvm.max.s(i16, i16)
declare i32 @llvm.nvvm.max.i(i32, i32)
declare i64 @llvm.nvvm.max.ll(i64, i64)
+declare i16 @llvm.nvvm.max.us(i16, i16)
declare i32 @llvm.nvvm.max.ui(i32, i32)
declare i64 @llvm.nvvm.max.ull(i64, i64)
+declare i16 @llvm.nvvm.min.s(i16, i16)
declare i32 @llvm.nvvm.min.i(i32, i32)
declare i64 @llvm.nvvm.min.ll(i64, i64)
+declare i16 @llvm.nvvm.min.us(i16, i16)
declare i32 @llvm.nvvm.min.ui(i32, i32)
declare i64 @llvm.nvvm.min.ull(i64, i64)
@@ -65,38 +69,54 @@ define void @abs(i32 %a, i64 %b) {
}
; CHECK-LABEL: @min_max
-define void @min_max(i32 %a1, i32 %a2, i64 %b1, i64 %b2) {
-; CHECK: [[maxi:%[a-zA-Z0-9.]+]] = icmp sge i32 %a1, %a2
-; CHECK: select i1 [[maxi]], i32 %a1, i32 %a2
- %r1 = call i32 @llvm.nvvm.max.i(i32 %a1, i32 %a2)
-
-; CHECK: [[maxll:%[a-zA-Z0-9.]+]] = icmp sge i64 %b1, %b2
-; CHECK: select i1 [[maxll]], i64 %b1, i64 %b2
- %r2 = call i64 @llvm.nvvm.max.ll(i64 %b1, i64 %b2)
-
-; CHECK: [[maxui:%[a-zA-Z0-9.]+]] = icmp uge i32 %a1, %a2
-; CHECK: select i1 [[maxui]], i32 %a1, i32 %a2
- %r3 = call i32 @llvm.nvvm.max.ui(i32 %a1, i32 %a2)
-
-; CHECK: [[maxull:%[a-zA-Z0-9.]+]] = icmp uge i64 %b1, %b2
-; CHECK: select i1 [[maxull]], i64 %b1, i64 %b2
- %r4 = call i64 @llvm.nvvm.max.ull(i64 %b1, i64 %b2)
-
-; CHECK: [[mini:%[a-zA-Z0-9.]+]] = icmp sle i32 %a1, %a2
-; CHECK: select i1 [[mini]], i32 %a1, i32 %a2
- %r5 = call i32 @llvm.nvvm.min.i(i32 %a1, i32 %a2)
-
-; CHECK: [[minll:%[a-zA-Z0-9.]+]] = icmp sle i64 %b1, %b2
-; CHECK: select i1 [[minll]], i64 %b1, i64 %b2
- %r6 = call i64 @llvm.nvvm.min.ll(i64 %b1, i64 %b2)
-
-; CHECK: [[minui:%[a-zA-Z0-9.]+]] = icmp ule i32 %a1, %a2
-; CHECK: select i1 [[minui]], i32 %a1, i32 %a2
- %r7 = call i32 @llvm.nvvm.min.ui(i32 %a1, i32 %a2)
-
-; CHECK: [[minull:%[a-zA-Z0-9.]+]] = icmp ule i64 %b1, %b2
-; CHECK: select i1 [[minull]], i64 %b1, i64 %b2
- %r8 = call i64 @llvm.nvvm.min.ull(i64 %b1, i64 %b2)
+define void @min_max(i16 %a1, i16 %a2, i32 %b1, i32 %b2, i64 %c1, i64 %c2) {
+; CHECK: [[maxs:%[a-zA-Z0-9.]+]] = icmp sge i16 %a1, %a2
+; CHECK: select i1 [[maxs]], i16 %a1, i16 %a2
+ %r1 = call i16 @llvm.nvvm.max.s(i16 %a1, i16 %a2)
+
+; CHECK: [[maxi:%[a-zA-Z0-9.]+]] = icmp sge i32 %b1, %b2
+; CHECK: select i1 [[maxi]], i32 %b1, i32 %b2
+ %r2 = call i32 @llvm.nvvm.max.i(i32 %b1, i32 %b2)
+
+; CHECK: [[maxll:%[a-zA-Z0-9.]+]] = icmp sge i64 %c1, %c2
+; CHECK: select i1 [[maxll]], i64 %c1, i64 %c2
+ %r3 = call i64 @llvm.nvvm.max.ll(i64 %c1, i64 %c2)
+
+; CHECK: [[maxus:%[a-zA-Z0-9.]+]] = icmp uge i16 %a1, %a2
+; CHECK: select i1 [[maxus]], i16 %a1, i16 %a2
+ %r4 = call i16 @llvm.nvvm.max.us(i16 %a1, i16 %a2)
+
+; CHECK: [[maxui:%[a-zA-Z0-9.]+]] = icmp uge i32 %b1, %b2
+; CHECK: select i1 [[maxui]], i32 %b1, i32 %b2
+ %r5 = call i32 @llvm.nvvm.max.ui(i32 %b1, i32 %b2)
+
+; CHECK: [[maxull:%[a-zA-Z0-9.]+]] = icmp uge i64 %c1, %c2
+; CHECK: select i1 [[maxull]], i64 %c1, i64 %c2
+ %r6 = call i64 @llvm.nvvm.max.ull(i64 %c1, i64 %c2)
+
+; CHECK: [[mins:%[a-zA-Z0-9.]+]] = icmp sle i16 %a1, %a2
+; CHECK: select i1 [[mins]], i16 %a1, i16 %a2
+ %r7 = call i16 @llvm.nvvm.min.s(i16 %a1, i16 %a2)
+
+; CHECK: [[mini:%[a-zA-Z0-9.]+]] = icmp sle i32 %b1, %b2
+; CHECK: select i1 [[mini]], i32 %b1, i32 %b2
+ %r8 = call i32 @llvm.nvvm.min.i(i32 %b1, i32 %b2)
+
+; CHECK: [[minll:%[a-zA-Z0-9.]+]] = icmp sle i64 %c1, %c2
+; CHECK: select i1 [[minll]], i64 %c1, i64 %c2
+ %r9 = call i64 @llvm.nvvm.min.ll(i64 %c1, i64 %c2)
+
+; CHECK: [[minus:%[a-zA-Z0-9.]+]] = icmp ule i16 %a1, %a2
+; CHECK: select i1 [[minus]], i16 %a1, i16 %a2
+ %r10 = call i16 @llvm.nvvm.min.us(i16 %a1, i16 %a2)
+
+; CHECK: [[minui:%[a-zA-Z0-9.]+]] = icmp ule i32 %b1, %b2
+; CHECK: select i1 [[minui]], i32 %b1, i32 %b2
+ %r11 = call i32 @llvm.nvvm.min.ui(i32 %b1, i32 %b2)
+
+; CHECK: [[minull:%[a-zA-Z0-9.]+]] = icmp ule i64 %c1, %c2
+; CHECK: select i1 [[minull]], i64 %c1, i64 %c2
+ %r12 = call i64 @llvm.nvvm.min.ull(i64 %c1, i64 %c2)
ret void
}
diff --git a/llvm/test/CodeGen/NVPTX/mulhi-intrins.ll b/llvm/test/CodeGen/NVPTX/mulhi-intrins.ll
new file mode 100644
index 0000000..efa9946
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/mulhi-intrins.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_50 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 | %ptxas-verify %}
+
+define i16 @test_mulhi_i16(i16 %x, i16 %y) {
+; CHECK-LABEL: test_mulhi_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %rs1, [test_mulhi_i16_param_0];
+; CHECK-NEXT: ld.param.u16 %rs2, [test_mulhi_i16_param_1];
+; CHECK-NEXT: mul.hi.s16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs3;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
+ %1 = call i16 @llvm.nvvm.mulhi.s(i16 %x, i16 %y)
+ ret i16 %1
+}
+
+define i16 @test_mulhi_u16(i16 %x, i16 %y) {
+; CHECK-LABEL: test_mulhi_u16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %rs1, [test_mulhi_u16_param_0];
+; CHECK-NEXT: ld.param.u16 %rs2, [test_mulhi_u16_param_1];
+; CHECK-NEXT: mul.hi.u16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs3;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
+ %1 = call i16 @llvm.nvvm.mulhi.us(i16 %x, i16 %y)
+ ret i16 %1
+}
+
+define i32 @test_mulhi_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: test_mulhi_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_mulhi_i32_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [test_mulhi_i32_param_1];
+; CHECK-NEXT: mul.hi.s32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT: ret;
+ %1 = call i32 @llvm.nvvm.mulhi.i(i32 %x, i32 %y)
+ ret i32 %1
+}
+
+define i32 @test_mulhi_u32(i32 %x, i32 %y) {
+; CHECK-LABEL: test_mulhi_u32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_mulhi_u32_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [test_mulhi_u32_param_1];
+; CHECK-NEXT: mul.hi.u32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT: ret;
+ %1 = call i32 @llvm.nvvm.mulhi.ui(i32 %x, i32 %y)
+ ret i32 %1
+}
+
+define i64 @test_mulhi_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: test_mulhi_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_mulhi_i64_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [test_mulhi_i64_param_1];
+; CHECK-NEXT: mul.hi.s64 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd3;
+; CHECK-NEXT: ret;
+ %1 = call i64 @llvm.nvvm.mulhi.ll(i64 %x, i64 %y)
+ ret i64 %1
+}
+
+define i64 @test_mulhi_u64(i64 %x, i64 %y) {
+; CHECK-LABEL: test_mulhi_u64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_mulhi_u64_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [test_mulhi_u64_param_1];
+; CHECK-NEXT: mul.hi.u64 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd3;
+; CHECK-NEXT: ret;
+ %1 = call i64 @llvm.nvvm.mulhi.ull(i64 %x, i64 %y)
+ ret i64 %1
+}
+
+declare i16 @llvm.nvvm.mulhi.s(i16, i16)
+declare i16 @llvm.nvvm.mulhi.us(i16, i16)
+declare i32 @llvm.nvvm.mulhi.i(i32, i32)
+declare i32 @llvm.nvvm.mulhi.ui(i32, i32)
+declare i64 @llvm.nvvm.mulhi.ll(i64, i64)
+declare i64 @llvm.nvvm.mulhi.ull(i64, i64)
diff --git a/llvm/test/CodeGen/NVPTX/sad-intrins.ll b/llvm/test/CodeGen/NVPTX/sad-intrins.ll
new file mode 100644
index 0000000..a09413b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/sad-intrins.ll
@@ -0,0 +1,110 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_50 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 | %ptxas-verify %}
+
+define i16 @test_sad_i16(i16 %x, i16 %y, i16 %z) {
+; CHECK-LABEL: test_sad_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %rs1, [test_sad_i16_param_0];
+; CHECK-NEXT: ld.param.u16 %rs2, [test_sad_i16_param_1];
+; CHECK-NEXT: ld.param.u16 %rs3, [test_sad_i16_param_2];
+; CHECK-NEXT: sad.s16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
+ %1 = call i16 @llvm.nvvm.sad.s(i16 %x, i16 %y, i16 %z)
+ ret i16 %1
+}
+
+define i16 @test_sad_u16(i16 %x, i16 %y, i16 %z) {
+; CHECK-LABEL: test_sad_u16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %rs1, [test_sad_u16_param_0];
+; CHECK-NEXT: ld.param.u16 %rs2, [test_sad_u16_param_1];
+; CHECK-NEXT: ld.param.u16 %rs3, [test_sad_u16_param_2];
+; CHECK-NEXT: sad.u16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
+ %1 = call i16 @llvm.nvvm.sad.us(i16 %x, i16 %y, i16 %z)
+ ret i16 %1
+}
+
+define i32 @test_sad_i32(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: test_sad_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_sad_i32_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [test_sad_i32_param_1];
+; CHECK-NEXT: ld.param.u32 %r3, [test_sad_i32_param_2];
+; CHECK-NEXT: sad.s32 %r4, %r1, %r2, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
+; CHECK-NEXT: ret;
+ %1 = call i32 @llvm.nvvm.sad.i(i32 %x, i32 %y, i32 %z)
+ ret i32 %1
+}
+
+define i32 @test_sad_u32(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: test_sad_u32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_sad_u32_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [test_sad_u32_param_1];
+; CHECK-NEXT: ld.param.u32 %r3, [test_sad_u32_param_2];
+; CHECK-NEXT: sad.u32 %r4, %r1, %r2, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
+; CHECK-NEXT: ret;
+ %1 = call i32 @llvm.nvvm.sad.ui(i32 %x, i32 %y, i32 %z)
+ ret i32 %1
+}
+
+define i64 @test_sad_i64(i64 %x, i64 %y, i64 %z) {
+; CHECK-LABEL: test_sad_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_sad_i64_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [test_sad_i64_param_1];
+; CHECK-NEXT: ld.param.u64 %rd3, [test_sad_i64_param_2];
+; CHECK-NEXT: sad.s64 %rd4, %rd1, %rd2, %rd3;
+; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; CHECK-NEXT: ret;
+ %1 = call i64 @llvm.nvvm.sad.ll(i64 %x, i64 %y, i64 %z)
+ ret i64 %1
+}
+
+define i64 @test_sad_u64(i64 %x, i64 %y, i64 %z) {
+; CHECK-LABEL: test_sad_u64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_sad_u64_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [test_sad_u64_param_1];
+; CHECK-NEXT: ld.param.u64 %rd3, [test_sad_u64_param_2];
+; CHECK-NEXT: sad.u64 %rd4, %rd1, %rd2, %rd3;
+; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; CHECK-NEXT: ret;
+ %1 = call i64 @llvm.nvvm.sad.ull(i64 %x, i64 %y, i64 %z)
+ ret i64 %1
+}
+
+declare i16 @llvm.nvvm.sad.s(i16, i16, i16)
+declare i16 @llvm.nvvm.sad.us(i16, i16, i16)
+declare i32 @llvm.nvvm.sad.i(i32, i32, i32)
+declare i32 @llvm.nvvm.sad.ui(i32, i32, i32)
+declare i64 @llvm.nvvm.sad.ll(i64, i64, i64)
+declare i64 @llvm.nvvm.sad.ull(i64, i64, i64)