aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/WebAssembly
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/WebAssembly')
-rw-r--r--llvm/test/CodeGen/WebAssembly/bulk-memory.ll97
-rw-r--r--llvm/test/CodeGen/WebAssembly/bulk-memory64.ll91
-rw-r--r--llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll208
3 files changed, 291 insertions, 105 deletions
diff --git a/llvm/test/CodeGen/WebAssembly/bulk-memory.ll b/llvm/test/CodeGen/WebAssembly/bulk-memory.ll
index ae170d7..d949068 100644
--- a/llvm/test/CodeGen/WebAssembly/bulk-memory.ll
+++ b/llvm/test/CodeGen/WebAssembly/bulk-memory.ll
@@ -104,6 +104,31 @@ define void @memset_i32(ptr %dest, i8 %val, i32 %len) {
ret void
}
+; CHECK-LABEL: memcpy_0:
+; CHECK-NEXT: .functype memcpy_0 (i32, i32) -> ()
+; CHECK-NEXT: return
+define void @memcpy_0(ptr %dest, ptr %src) {
+ call void @llvm.memcpy.p0.p0.i32(ptr %dest, ptr %src, i32 0, i1 0)
+ ret void
+}
+
+; CHECK-LABEL: memmove_0:
+; CHECK-NEXT: .functype memmove_0 (i32, i32) -> ()
+; CHECK-NEXT: return
+define void @memmove_0(ptr %dest, ptr %src) {
+ call void @llvm.memmove.p0.p0.i32(ptr %dest, ptr %src, i32 0, i1 0)
+ ret void
+}
+
+; CHECK-LABEL: memset_0:
+; NO-BULK-MEM-NOT: memory.fill
+; BULK-MEM-NEXT: .functype memset_0 (i32, i32) -> ()
+; BULK-MEM-NEXT: return
+define void @memset_0(ptr %dest, i8 %val) {
+ call void @llvm.memset.p0.i32(ptr %dest, i8 %val, i32 0, i1 0)
+ ret void
+}
+
; CHECK-LABEL: memcpy_1:
; CHECK-NEXT: .functype memcpy_1 (i32, i32) -> ()
; CHECK-NEXT: i32.load8_u $push[[L0:[0-9]+]]=, 0($1)
@@ -137,14 +162,8 @@ define void @memset_1(ptr %dest, i8 %val) {
; CHECK-LABEL: memcpy_1024:
; NO-BULK-MEM-NOT: memory.copy
; BULK-MEM-NEXT: .functype memcpy_1024 (i32, i32) -> ()
-; BULK-MEM-NEXT: block
; BULK-MEM-NEXT: i32.const $push[[L0:[0-9]+]]=, 1024
-; BULK-MEM-NEXT: i32.eqz $push[[L1:[0-9]+]]=, $pop[[L0]]
-; BULK-MEM-NEXT: br_if 0, $pop[[L1]]
-; BULK-MEM-NEXT: i32.const $push[[L2:[0-9]+]]=, 1024
-; BULK-MEM-NEXT: memory.copy 0, 0, $0, $1, $pop[[L2]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
+; BULK-MEM-NEXT: memory.copy 0, 0, $0, $1, $pop[[L0]]
; BULK-MEM-NEXT: return
define void @memcpy_1024(ptr %dest, ptr %src) {
call void @llvm.memcpy.p0.p0.i32(ptr %dest, ptr %src, i32 1024, i1 0)
@@ -154,14 +173,8 @@ define void @memcpy_1024(ptr %dest, ptr %src) {
; CHECK-LABEL: memmove_1024:
; NO-BULK-MEM-NOT: memory.copy
; BULK-MEM-NEXT: .functype memmove_1024 (i32, i32) -> ()
-; BULK-MEM-NEXT: block
; BULK-MEM-NEXT: i32.const $push[[L0:[0-9]+]]=, 1024
-; BULK-MEM-NEXT: i32.eqz $push[[L1:[0-9]+]]=, $pop[[L0]]
-; BULK-MEM-NEXT: br_if 0, $pop[[L1]]
-; BULK-MEM-NEXT: i32.const $push[[L2:[0-9]+]]=, 1024
-; BULK-MEM-NEXT: memory.copy 0, 0, $0, $1, $pop[[L2]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
+; BULK-MEM-NEXT: memory.copy 0, 0, $0, $1, $pop[[L0]]
; BULK-MEM-NEXT: return
define void @memmove_1024(ptr %dest, ptr %src) {
call void @llvm.memmove.p0.p0.i32(ptr %dest, ptr %src, i32 1024, i1 0)
@@ -171,14 +184,8 @@ define void @memmove_1024(ptr %dest, ptr %src) {
; CHECK-LABEL: memset_1024:
; NO-BULK-MEM-NOT: memory.fill
; BULK-MEM-NEXT: .functype memset_1024 (i32, i32) -> ()
-; BULK-MEM-NEXT: block
; BULK-MEM-NEXT: i32.const $push[[L0:[0-9]+]]=, 1024
-; BULK-MEM-NEXT: i32.eqz $push[[L1:[0-9]+]]=, $pop[[L0]]
-; BULK-MEM-NEXT: br_if 0, $pop[[L1]]
-; BULK-MEM-NEXT: i32.const $push[[L2:[0-9]+]]=, 1024
-; BULK-MEM-NEXT: memory.fill 0, $0, $1, $pop[[L2]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
+; BULK-MEM-NEXT: memory.fill 0, $0, $1, $pop[[L0]]
; BULK-MEM-NEXT: return
define void @memset_1024(ptr %dest, i8 %val) {
call void @llvm.memset.p0.i32(ptr %dest, i8 %val, i32 1024, i1 0)
@@ -201,17 +208,11 @@ define void @memset_1024(ptr %dest, i8 %val) {
; BULK-MEM-NEXT: .functype memcpy_alloca_src (i32) -> ()
; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112
-; BULK-MEM-NEXT: i32.sub $[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; BULK-MEM-NEXT: block
-; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 100
-; BULK-MEM-NEXT: i32.eqz $push[[L4:[0-9]+]]=, $pop[[L3]]
-; BULK-MEM-NEXT: br_if 0, $pop[[L4]]
-; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 12
-; BULK-MEM-NEXT: i32.add $push[[L6:[0-9]+]]=, $[[L2]], $pop[[L5]]
-; BULK-MEM-NEXT: i32.const $push[[L7:[0-9]+]]=, 100
-; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L6]], $pop[[L7]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
+; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
+; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
+; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100
+; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L4]], $pop[[L5]]
; BULK-MEM-NEXT: return
define void @memcpy_alloca_src(ptr %dst) {
%a = alloca [100 x i8]
@@ -224,17 +225,11 @@ define void @memcpy_alloca_src(ptr %dst) {
; BULK-MEM-NEXT: .functype memcpy_alloca_dst (i32) -> ()
; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112
-; BULK-MEM-NEXT: i32.sub $[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; BULK-MEM-NEXT: block
-; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 100
-; BULK-MEM-NEXT: i32.eqz $push[[L4:[0-9]+]]=, $pop[[L3]]
-; BULK-MEM-NEXT: br_if 0, $pop[[L4]]
-; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 12
-; BULK-MEM-NEXT: i32.add $push[[L6:[0-9]+]]=, $[[L2]], $pop[[L5]]
-; BULK-MEM-NEXT: i32.const $push[[L7:[0-9]+]]=, 100
-; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L6]], $0, $pop[[L7]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
+; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
+; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
+; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100
+; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L4]], $0, $pop[[L5]]
; BULK-MEM-NEXT: return
define void @memcpy_alloca_dst(ptr %src) {
%a = alloca [100 x i8]
@@ -247,17 +242,11 @@ define void @memcpy_alloca_dst(ptr %src) {
; BULK-MEM-NEXT: .functype memset_alloca (i32) -> ()
; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112
-; BULK-MEM-NEXT: i32.sub $1=, $pop[[L0]], $pop[[L1]]
-; BULK-MEM-NEXT: block
-; BULK-MEM-NEXT: i32.const $push[[L2:[0-9]+]]=, 100
-; BULK-MEM-NEXT: i32.eqz $push[[L3:[0-9]+]]=, $pop[[L2]]
-; BULK-MEM-NEXT: br_if 0, $pop[[L3]]
-; BULK-MEM-NEXT: i32.const $push[[L4:[0-9]+]]=, 12
-; BULK-MEM-NEXT: i32.add $push[[L5:[0-9]+]]=, $1, $pop[[L4]]
-; BULK-MEM-NEXT: i32.const $push[[L6:[0-9]+]]=, 100
-; BULK-MEM-NEXT: memory.fill 0, $pop[[L5]], $0, $pop[[L6]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
+; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
+; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
+; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100
+; BULK-MEM-NEXT: memory.fill 0, $pop[[L4]], $0, $pop[[L5]]
; BULK-MEM-NEXT: return
define void @memset_alloca(i8 %val) {
%a = alloca [100 x i8]
diff --git a/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll b/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll
index 0cf8493..d0206a3 100644
--- a/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll
+++ b/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll
@@ -110,6 +110,31 @@ define void @memset_i32(ptr %dest, i8 %val, i64 %len) {
ret void
}
+; CHECK-LABEL: memcpy_0:
+; CHECK-NEXT: .functype memcpy_0 (i64, i64) -> ()
+; CHECK-NEXT: return
+define void @memcpy_0(ptr %dest, ptr %src) {
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 0, i1 0)
+ ret void
+}
+
+; CHECK-LABEL: memmove_0:
+; CHECK-NEXT: .functype memmove_0 (i64, i64) -> ()
+; CHECK-NEXT: return
+define void @memmove_0(ptr %dest, ptr %src) {
+ call void @llvm.memmove.p0.p0.i64(ptr %dest, ptr %src, i64 0, i1 0)
+ ret void
+}
+
+; CHECK-LABEL: memset_0:
+; NO-BULK-MEM-NOT: memory.fill
+; BULK-MEM-NEXT: .functype memset_0 (i64, i32) -> ()
+; BULK-MEM-NEXT: return
+define void @memset_0(ptr %dest, i8 %val) {
+ call void @llvm.memset.p0.i64(ptr %dest, i8 %val, i64 0, i1 0)
+ ret void
+}
+
; CHECK-LABEL: memcpy_1:
; CHECK-NEXT: .functype memcpy_1 (i64, i64) -> ()
; CHECK-NEXT: i32.load8_u $push[[L0:[0-9]+]]=, 0($1)
@@ -143,14 +168,8 @@ define void @memset_1(ptr %dest, i8 %val) {
; CHECK-LABEL: memcpy_1024:
; NO-BULK-MEM-NOT: memory.copy
; BULK-MEM-NEXT: .functype memcpy_1024 (i64, i64) -> ()
-; BULK-MEM-NEXT: block
-; BULK-MEM-NEXT: i64.const $push[[L1:[0-9]+]]=, 1024
-; BULK-MEM-NEXT: i64.eqz $push0=, $pop[[L1]]
-; BULK-MEM-NEXT: br_if 0, $pop0
; BULK-MEM-NEXT: i64.const $push[[L0:[0-9]+]]=, 1024
; BULK-MEM-NEXT: memory.copy 0, 0, $0, $1, $pop[[L0]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
; BULK-MEM-NEXT: return
define void @memcpy_1024(ptr %dest, ptr %src) {
call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 1024, i1 0)
@@ -160,14 +179,8 @@ define void @memcpy_1024(ptr %dest, ptr %src) {
; CHECK-LABEL: memmove_1024:
; NO-BULK-MEM-NOT: memory.copy
; BULK-MEM-NEXT: .functype memmove_1024 (i64, i64) -> ()
-; BULK-MEM-NEXT: block
-; BULK-MEM-NEXT: i64.const $push[[L1:[0-9]+]]=, 1024
-; BULK-MEM-NEXT: i64.eqz $push0=, $pop[[L1]]
-; BULK-MEM-NEXT: br_if 0, $pop0
; BULK-MEM-NEXT: i64.const $push[[L0:[0-9]+]]=, 1024
; BULK-MEM-NEXT: memory.copy 0, 0, $0, $1, $pop[[L0]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
; BULK-MEM-NEXT: return
define void @memmove_1024(ptr %dest, ptr %src) {
call void @llvm.memmove.p0.p0.i64(ptr %dest, ptr %src, i64 1024, i1 0)
@@ -177,14 +190,8 @@ define void @memmove_1024(ptr %dest, ptr %src) {
; CHECK-LABEL: memset_1024:
; NO-BULK-MEM-NOT: memory.fill
; BULK-MEM-NEXT: .functype memset_1024 (i64, i32) -> ()
-; BULK-MEM-NEXT: block
-; BULK-MEM-NEXT: i64.const $push[[L1:[0-9]+]]=, 1024
-; BULK-MEM-NEXT: i64.eqz $push0=, $pop[[L1]]
-; BULK-MEM-NEXT: br_if 0, $pop0
; BULK-MEM-NEXT: i64.const $push[[L0:[0-9]+]]=, 1024
; BULK-MEM-NEXT: memory.fill 0, $0, $1, $pop[[L0]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
; BULK-MEM-NEXT: return
define void @memset_1024(ptr %dest, i8 %val) {
call void @llvm.memset.p0.i64(ptr %dest, i8 %val, i64 1024, i1 0)
@@ -207,17 +214,11 @@ define void @memset_1024(ptr %dest, i8 %val) {
; BULK-MEM-NEXT: .functype memcpy_alloca_src (i64) -> ()
; BULK-MEM-NEXT: global.get $push[[L1:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i64.const $push[[L0:[0-9]+]]=, 112
-; BULK-MEM-NEXT: i64.sub $[[L2:[0-9]+]]=, $pop[[L1]], $pop[[L0]]
-; BULK-MEM-NEXT: block
-; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 100
-; BULK-MEM-NEXT: i64.eqz $push[[L4:[0-9]+]]=, $pop[[L3]]
-; BULK-MEM-NEXT: br_if 0, $pop[[L4]]
-; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 12
-; BULK-MEM-NEXT: i64.add $push[[L6:[0-9]+]]=, $[[L2]], $pop[[L5]]
-; BULK-MEM-NEXT: i64.const $push[[L7:[0-9]+]]=, 100
-; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L6]], $pop[[L7]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
+; BULK-MEM-NEXT: i64.sub $push[[L2:[0-9]+]]=, $pop[[L1]], $pop[[L0]]
+; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i64.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
+; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 100
+; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L4]], $pop[[L5]]
; BULK-MEM-NEXT: return
define void @memcpy_alloca_src(ptr %dst) {
%a = alloca [100 x i8]
@@ -230,17 +231,11 @@ define void @memcpy_alloca_src(ptr %dst) {
; BULK-MEM-NEXT: .functype memcpy_alloca_dst (i64) -> ()
; BULK-MEM-NEXT: global.get $push[[L1:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i64.const $push[[L0:[0-9]+]]=, 112
-; BULK-MEM-NEXT: i64.sub $[[L2:[0-9]+]]=, $pop[[L1]], $pop[[L0]]
-; BULK-MEM-NEXT: block
-; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 100
-; BULK-MEM-NEXT: i64.eqz $push[[L4:[0-9]+]]=, $pop[[L3]]
-; BULK-MEM-NEXT: br_if 0, $pop[[L4]]
-; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 12
-; BULK-MEM-NEXT: i64.add $push[[L6:[0-9]+]]=, $[[L2]], $pop[[L5]]
-; BULK-MEM-NEXT: i64.const $push[[L7:[0-9]+]]=, 100
-; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L6]], $0, $pop[[L7]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
+; BULK-MEM-NEXT: i64.sub $push[[L2:[0-9]+]]=, $pop[[L1]], $pop[[L0]]
+; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i64.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
+; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 100
+; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L4]], $0, $pop[[L5]]
; BULK-MEM-NEXT: return
define void @memcpy_alloca_dst(ptr %src) {
%a = alloca [100 x i8]
@@ -253,17 +248,11 @@ define void @memcpy_alloca_dst(ptr %src) {
; BULK-MEM-NEXT: .functype memset_alloca (i32) -> ()
; BULK-MEM-NEXT: global.get $push[[L1:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i64.const $push[[L0:[0-9]+]]=, 112
-; BULK-MEM-NEXT: i64.sub $1=, $pop[[L1]], $pop[[L0]]
-; BULK-MEM-NEXT: block
-; BULK-MEM-NEXT: i64.const $push[[L2:[0-9]+]]=, 100
-; BULK-MEM-NEXT: i64.eqz $push[[L3:[0-9]+]]=, $pop[[L2]]
-; BULK-MEM-NEXT: br_if 0, $pop[[L3]]
-; BULK-MEM-NEXT: i64.const $push[[L4:[0-9]+]]=, 12
-; BULK-MEM-NEXT: i64.add $push[[L5:[0-9]+]]=, $1, $pop[[L4]]
-; BULK-MEM-NEXT: i64.const $push[[L6:[0-9]+]]=, 100
-; BULK-MEM-NEXT: memory.fill 0, $pop[[L5]], $0, $pop[[L6]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
+; BULK-MEM-NEXT: i64.sub $push[[L2:[0-9]+]]=, $pop[[L1]], $pop[[L0]]
+; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i64.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
+; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 100
+; BULK-MEM-NEXT: memory.fill 0, $pop[[L4]], $0, $pop[[L5]]
; BULK-MEM-NEXT: return
define void @memset_alloca(i8 %val) {
%a = alloca [100 x i8]
diff --git a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
index 04a2268..314e1b4 100644
--- a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
+++ b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
@@ -1,5 +1,6 @@
; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH
+; RUN: opt -mattr=+simd128,+relaxed-simd -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128,+relaxed-simd -verify-machineinstrs -o - | FileCheck %s --check-prefix=RELAXED-MAX-BANDWIDTH
target triple = "wasm32"
@@ -23,6 +24,10 @@ define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture n
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+
entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -47,6 +52,109 @@ for.body: ; preds = %entry, %for.body
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
+define hidden i32 @i32_mac_u8_s8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
+; CHECK-LABEL: i32_mac_u8_s8:
+; CHECK: loop
+; CHECK: v128.load32_zero
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: v128.load32_zero
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: i32x4.mul
+; CHECK: i32x4.add
+
+; MAX-BANDWIDTH: loop
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+
+; RELAXED-MAX-BANDWIDTH: loop
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+entry:
+ %cmp7.not = icmp eq i32 %N, 0
+ br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %entry, %for.body
+ %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %res.08 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.09
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = sext i8 %0 to i32
+ %arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.09
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv2 = zext i8 %1 to i32
+ %mul = mul nsw i32 %conv2, %conv
+ %add = add nsw i32 %mul, %res.08
+ %inc = add nuw i32 %i.09, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
; CHECK-LABEL: i32_mac_s16:
; CHECK: i32x4.load16x4_s 0:p2align=1
@@ -57,6 +165,12 @@ define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: i32x4.dot_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.dot_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.add
entry:
%cmp7.not = icmp eq i32 %N, 0
@@ -116,6 +230,31 @@ define hidden i64 @i64_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
; MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -156,6 +295,14 @@ define hidden i64 @i64_mac_s32(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
; MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
entry:
%cmp6.not = icmp eq i32 %N, 0
br i1 %cmp6.not, label %for.cond.cleanup, label %for.body
@@ -197,6 +344,15 @@ define hidden i32 @i32_mac_u8(ptr nocapture noundef readonly %a, ptr nocapture n
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i16x8.extmul_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i16x8.extmul_high_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+
entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -235,6 +391,13 @@ define hidden i32 @i32_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extmul_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extmul_high_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+
entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -277,6 +440,17 @@ define hidden i32 @i32_mac_u16_s16(ptr nocapture noundef readonly %a, ptr nocapt
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+
entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -335,6 +509,32 @@ define hidden i64 @i64_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
; MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
+
entry:
%cmp8.not = icmp eq i32 %N, 0
br i1 %cmp8.not, label %for.cond.cleanup, label %for.body
@@ -375,6 +575,14 @@ define hidden i64 @i64_mac_u32(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
; MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
entry:
%cmp6.not = icmp eq i32 %N, 0
br i1 %cmp6.not, label %for.cond.cleanup, label %for.body