12 files changed, 375 insertions, 177 deletions
diff --git a/llvm/test/CodeGen/WebAssembly/libcall_vectorized.ll b/llvm/test/CodeGen/WebAssembly/libcall_vectorized.ll
new file mode 100644
index 0000000..2d1056f
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/libcall_vectorized.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers  -mattr=+simd128 | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+declare <4 x float> @llvm.exp10.v4f32(<4 x float>)
+
+define <4 x float> @exp10_f32v4(<4 x float> %v) {
+; CHECK-LABEL: exp10_f32v4:
+; CHECK:         .functype exp10_f32v4 (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    local.get $push12=, 0
+; CHECK-NEXT:    f32x4.extract_lane $push0=, $pop12, 0
+; CHECK-NEXT:    call $push1=, exp10f, $pop0
+; CHECK-NEXT:    f32x4.splat $push2=, $pop1
+; CHECK-NEXT:    local.get $push13=, 0
+; CHECK-NEXT:    f32x4.extract_lane $push3=, $pop13, 1
+; CHECK-NEXT:    call $push4=, exp10f, $pop3
+; CHECK-NEXT:    f32x4.replace_lane $push5=, $pop2, 1, $pop4
+; CHECK-NEXT:    local.get $push14=, 0
+; CHECK-NEXT:    f32x4.extract_lane $push6=, $pop14, 2
+; CHECK-NEXT:    call $push7=, exp10f, $pop6
+; CHECK-NEXT:    f32x4.replace_lane $push8=, $pop5, 2, $pop7
+; CHECK-NEXT:    local.get $push15=, 0
+; CHECK-NEXT:    f32x4.extract_lane $push9=, $pop15, 3
+; CHECK-NEXT:    call $push10=, exp10f, $pop9
+; CHECK-NEXT:    f32x4.replace_lane $push11=, $pop8, 3, $pop10
+; CHECK-NEXT:    return $pop11
+entry:
+  %r = call <4 x float> @llvm.exp10.v4f32(<4 x float> %v)
+  ret <4 x float> %r
+}
diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj-alloca.ll b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj-alloca.ll
new file mode 100644
index 0000000..0f968de
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj-alloca.ll
@@ -0,0 +1,129 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -wasm-lower-em-ehsjlj -wasm-enable-sjlj -mtriple=wasm32-unknown-emscripten < %s | FileCheck %s
+
+@buf = external global i8
+declare i32 @setjmp(ptr) returns_twice
+declare void @dummy()
+
+define void @test_static() {
+; CHECK-LABEL: define void @test_static() personality ptr @__gxx_wasm_personality_v0 {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[X:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[FUNCTIONINVOCATIONID:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label %[[SETJMP_DISPATCH:.*]]
+; CHECK:       [[SETJMP_DISPATCH]]:
+; CHECK-NEXT:    [[VAL1:%.*]] = phi i32 [ [[VAL:%.*]], %[[IF_END:.*]] ], [ undef, %[[ENTRY]] ]
+; CHECK-NEXT:    [[LABEL_PHI:%.*]] = phi i32 [ [[LABEL:%.*]], %[[IF_END]] ], [ -1, %[[ENTRY]] ]
+; CHECK-NEXT:    switch i32 [[LABEL_PHI]], label %[[ENTRY_SPLIT:.*]] [
+; CHECK-NEXT:      i32 1, label %[[ENTRY_SPLIT_SPLIT:.*]]
+; CHECK-NEXT:    ]
+; CHECK:       [[ENTRY_SPLIT]]:
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[X]])
+; CHECK-NEXT:    call void @__wasm_setjmp(ptr @buf, i32 1, ptr [[FUNCTIONINVOCATIONID]])
+; CHECK-NEXT:    br label %[[ENTRY_SPLIT_SPLIT]]
+; CHECK:       [[ENTRY_SPLIT_SPLIT]]:
+; CHECK-NEXT:    [[SETJMP_RET:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT]] ], [ [[VAL1]], %[[SETJMP_DISPATCH]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[SETJMP_RET]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label %[[IF:.*]], label %[[ELSE:.*]]
+; CHECK:       [[IF]]:
+; CHECK-NEXT:    invoke void @dummy()
+; CHECK-NEXT:            to [[DOTNOEXC:label %.*]] unwind label %[[CATCH_DISPATCH_LONGJMP:.*]]
+; CHECK:       [[_NOEXC:.*:]]
+; CHECK-NEXT:    ret void
+; CHECK:       [[ELSE]]:
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[X]])
+; CHECK-NEXT:    ret void
+; CHECK:       [[CATCH_DISPATCH_LONGJMP]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = catchswitch within none [label %catch.longjmp] unwind to caller
+; CHECK:       [[CATCH_LONGJMP:.*:]]
+; CHECK-NEXT:    [[TMP1:%.*]] = catchpad within [[TMP0]] []
+; CHECK-NEXT:    [[THROWN:%.*]] = call ptr @llvm.wasm.catch(i32 1)
+; CHECK-NEXT:    [[ENV_GEP:%.*]] = getelementptr { ptr, i32 }, ptr [[THROWN]], i32 0, i32 0
+; CHECK-NEXT:    [[VAL_GEP:%.*]] = getelementptr { ptr, i32 }, ptr [[THROWN]], i32 0, i32 1
+; CHECK-NEXT:    [[ENV:%.*]] = load ptr, ptr [[ENV_GEP]], align 4
+; CHECK-NEXT:    [[VAL]] = load i32, ptr [[VAL_GEP]], align 4
+; CHECK-NEXT:    [[LABEL]] = call i32 @__wasm_setjmp_test(ptr [[ENV]], ptr [[FUNCTIONINVOCATIONID]]) [ "funclet"(token [[TMP1]]) ]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[LABEL]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[IF_THEN:.*]], label %[[IF_END]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    call void @__wasm_longjmp(ptr [[ENV]], i32 [[VAL]]) [ "funclet"(token [[TMP1]]) ]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[IF_END]]:
+; CHECK-NEXT:    catchret from [[TMP1]] to label %[[SETJMP_DISPATCH]]
+;
+entry:
+  %x = alloca i32, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr %x)
+  %call = call i32 @setjmp(ptr @buf) returns_twice
+  %cmp = icmp eq i32 %call, 0
+  br i1 %cmp, label %if, label %else
+
+if:
+  call void @dummy()
+  ret void
+
+else:
+  call void @llvm.lifetime.end.p0(i64 4, ptr %x)
+  ret void
+}
+
+define void @test_dynamic(i32 %size) {
+; CHECK-LABEL: define void @test_dynamic(
+; CHECK-SAME: i32 [[SIZE:%.*]]) personality ptr @__gxx_wasm_personality_v0 {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[FUNCTIONINVOCATIONID:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label %[[SETJMP_DISPATCH:.*]]
+; CHECK:       [[SETJMP_DISPATCH]]:
+; CHECK-NEXT:    [[VAL1:%.*]] = phi i32 [ [[VAL:%.*]], %[[IF_END:.*]] ], [ undef, %[[ENTRY]] ]
+; CHECK-NEXT:    [[LABEL_PHI:%.*]] = phi i32 [ [[LABEL:%.*]], %[[IF_END]] ], [ -1, %[[ENTRY]] ]
+; CHECK-NEXT:    switch i32 [[LABEL_PHI]], label %[[ENTRY_SPLIT:.*]] [
+; CHECK-NEXT:      i32 1, label %[[ENTRY_SPLIT_SPLIT:.*]]
+; CHECK-NEXT:    ]
+; CHECK:       [[ENTRY_SPLIT]]:
+; CHECK-NEXT:    [[X:%.*]] = alloca i32, i32 [[SIZE]], align 4
+; CHECK-NEXT:    call void @__wasm_setjmp(ptr @buf, i32 1, ptr [[FUNCTIONINVOCATIONID]])
+; CHECK-NEXT:    br label %[[ENTRY_SPLIT_SPLIT]]
+; CHECK:       [[ENTRY_SPLIT_SPLIT]]:
+; CHECK-NEXT:    [[SETJMP_RET:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT]] ], [ [[VAL1]], %[[SETJMP_DISPATCH]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[SETJMP_RET]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label %[[IF:.*]], label %[[ELSE:.*]]
+; CHECK:       [[IF]]:
+; CHECK-NEXT:    invoke void @dummy()
+; CHECK-NEXT:            to [[DOTNOEXC:label %.*]] unwind label %[[CATCH_DISPATCH_LONGJMP:.*]]
+; CHECK:       [[_NOEXC:.*:]]
+; CHECK-NEXT:    ret void
+; CHECK:       [[ELSE]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[CATCH_DISPATCH_LONGJMP]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = catchswitch within none [label %catch.longjmp] unwind to caller
+; CHECK:       [[CATCH_LONGJMP:.*:]]
+; CHECK-NEXT:    [[TMP1:%.*]] = catchpad within [[TMP0]] []
+; CHECK-NEXT:    [[THROWN:%.*]] = call ptr @llvm.wasm.catch(i32 1)
+; CHECK-NEXT:    [[ENV_GEP:%.*]] = getelementptr { ptr, i32 }, ptr [[THROWN]], i32 0, i32 0
+; CHECK-NEXT:    [[VAL_GEP:%.*]] = getelementptr { ptr, i32 }, ptr [[THROWN]], i32 0, i32 1
+; CHECK-NEXT:    [[ENV:%.*]] = load ptr, ptr [[ENV_GEP]], align 4
+; CHECK-NEXT:    [[VAL]] = load i32, ptr [[VAL_GEP]], align 4
+; CHECK-NEXT:    [[LABEL]] = call i32 @__wasm_setjmp_test(ptr [[ENV]], ptr [[FUNCTIONINVOCATIONID]]) [ "funclet"(token [[TMP1]]) ]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[LABEL]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[IF_THEN:.*]], label %[[IF_END]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    call void @__wasm_longjmp(ptr [[ENV]], i32 [[VAL]]) [ "funclet"(token [[TMP1]]) ]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[IF_END]]:
+; CHECK-NEXT:    catchret from [[TMP1]] to label %[[SETJMP_DISPATCH]]
+;
+entry:
+  %x = alloca i32, i32 %size, align 4
+  call void @llvm.lifetime.start.p0(i64 -1, ptr %x)
+  %call = call i32 @setjmp(ptr @buf) returns_twice
+  %cmp = icmp eq i32 %call, 0
+  br i1 %cmp, label %if, label %else
+
+if:
+  call void @dummy()
+  ret void
+
+else:
+  call void @llvm.lifetime.end.p0(i64 -1, ptr %x)
+  ret void
+}
diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj-debuginfo.ll b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj-debuginfo.ll
index fec9836..bab8403 100644
--- a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj-debuginfo.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj-debuginfo.ll
@@ -16,10 +16,10 @@ entry:
   call void @foo(), !dbg !7
   ret void, !dbg !8
 ; CHECK: entry:
-  ; CHECK-NEXT: %functionInvocationId = alloca i32, align 4, !dbg ![[DL0:.*]]
+  ; CHECK-NEXT: %buf = alloca [1 x %struct.__jmp_buf_tag], align 16, !dbg ![[DL0:.*]]
+  ; CHECK-NEXT: %functionInvocationId = alloca i32, align 4, !dbg ![[DL0]]
 
 ; CHECK: entry.split:
-  ; CHECK: alloca {{.*}}, !dbg ![[DL0]]
   ; CHECK: call void @__wasm_setjmp{{.*}}, !dbg ![[DL1:.*]]
   ; CHECK-NEXT: br {{.*}}, !dbg ![[DL2:.*]]
 
diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll
index b584342..51dcf2f 100644
--- a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll
@@ -22,17 +22,17 @@ entry:
   call void @longjmp(ptr %buf, i32 1) #1
   unreachable
 ; CHECK: entry:
+; CHECK-NEXT:  %buf = alloca [1 x %struct.__jmp_buf_tag], align 16
 ; CHECK-NEXT: %functionInvocationId = alloca i32, align 4
 ; CHECK-NEXT: br label %entry.split
 
 ; CHECK: entry.split
-; CHECK-NEXT: %[[BUF:.*]] = alloca [1 x %struct.__jmp_buf_tag]
-; CHECK-NEXT: call void @__wasm_setjmp(ptr %[[BUF]], i32 1, ptr %functionInvocationId)
+; CHECK-NEXT: call void @__wasm_setjmp(ptr %buf, i32 1, ptr %functionInvocationId)
 ; CHECK-NEXT: br label %entry.split.split
 
 ; CHECK: entry.split.split:
 ; CHECK-NEXT: phi i32 [ 0, %entry.split ], [ %[[LONGJMP_RESULT:.*]], %if.end ]
-; CHECK-NEXT: %[[JMPBUF:.*]] = ptrtoint ptr %[[BUF]] to [[PTR]]
+; CHECK-NEXT: %[[JMPBUF:.*]] = ptrtoint ptr %buf to [[PTR]]
 ; CHECK-NEXT: store [[PTR]] 0, ptr @__THREW__
 ; CHECK-NEXT: call cc{{.*}} void @__invoke_void_[[PTR]]_i32(ptr @emscripten_longjmp, [[PTR]] %[[JMPBUF]], i32 1)
 ; CHECK-NEXT: %[[__THREW__VAL:.*]] = load [[PTR]], ptr @__THREW__
diff --git a/llvm/test/CodeGen/WebAssembly/lower-wasm-ehsjlj.ll b/llvm/test/CodeGen/WebAssembly/lower-wasm-ehsjlj.ll
index b4c93c4..9de6652 100644
--- a/llvm/test/CodeGen/WebAssembly/lower-wasm-ehsjlj.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-wasm-ehsjlj.ll
@@ -108,7 +108,7 @@ catch:                                            ; preds = %catch.start
   call void @__cxa_end_catch() [ "funclet"(token %2) ]
   catchret from %2 to label %catchret.dest
 ; CHECK: catch:                                            ; preds = %catch.start
-; CHECK-NEXT:   %exn = load ptr, ptr %exn.slot6, align 4
+; CHECK-NEXT:   %exn = load ptr, ptr %exn.slot, align 4
 ; CHECK-NEXT:   %5 = call ptr @__cxa_begin_catch(ptr %exn) #3 [ "funclet"(token %2) ]
 ; CHECK-NEXT:   invoke void @__cxa_end_catch() [ "funclet"(token %2) ]
 ; CHECK-NEXT:           to label %.noexc unwind label %catch.dispatch.longjmp
diff --git a/llvm/test/CodeGen/WebAssembly/lower-wasm-sjlj.ll b/llvm/test/CodeGen/WebAssembly/lower-wasm-sjlj.ll
index 82c04e2..e1cb859 100644
--- a/llvm/test/CodeGen/WebAssembly/lower-wasm-sjlj.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-wasm-sjlj.ll
@@ -25,26 +25,24 @@ entry:
   unreachable
 
 ; CHECK:    entry:
+; CHECK-NEXT: %buf = alloca [1 x %struct.__jmp_buf_tag], align 16
 ; CHECK-NEXT: %functionInvocationId = alloca i32, align 4
 ; CHECK-NEXT: br label %setjmp.dispatch
 
 ; CHECK:    setjmp.dispatch:
 ; CHECK-NEXT: %[[VAL2:.*]] = phi i32 [ %val, %if.end ], [ undef, %entry ]
-; CHECK-NEXT: %[[BUF:.*]] = phi ptr [ %[[BUF2:.*]], %if.end ], [ undef, %entry ]
 ; CHECK-NEXT: %label.phi = phi i32 [ %label, %if.end ], [ -1, %entry ]
 ; CHECK-NEXT: switch i32 %label.phi, label %entry.split [
 ; CHECK-NEXT:   i32 1, label %entry.split.split
 ; CHECK-NEXT: ]
 
 ; CHECK:    entry.split:
-; CHECK-NEXT: %buf = alloca [1 x %struct.__jmp_buf_tag], align 16
 ; CHECK-NEXT: call void @__wasm_setjmp(ptr %buf, i32 1, ptr %functionInvocationId)
 ; CHECK-NEXT: br label %entry.split.split
 
 ; CHECK:    entry.split.split:
-; CHECK-NEXT: %[[BUF2]] = phi ptr [ %[[BUF]], %setjmp.dispatch ], [ %buf, %entry.split ]
 ; CHECK-NEXT: %setjmp.ret = phi i32 [ 0, %entry.split ], [ %[[VAL2]], %setjmp.dispatch ]
-; CHECK-NEXT: invoke void @__wasm_longjmp(ptr %[[BUF2]], i32 1)
+; CHECK-NEXT: invoke void @__wasm_longjmp(ptr %buf, i32 1)
 ; CHECK-NEXT:         to label %.noexc unwind label %catch.dispatch.longjmp
 
 ; CHECK:    .noexc:
diff --git a/llvm/test/CodeGen/WebAssembly/ref-test-func.ll b/llvm/test/CodeGen/WebAssembly/ref-test-func.ll
index e4014ba..ea2453f 100644
--- a/llvm/test/CodeGen/WebAssembly/ref-test-func.ll
+++ b/llvm/test/CodeGen/WebAssembly/ref-test-func.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s --mtriple=wasm32-unknown-unknown -mcpu=mvp -mattr=+reference-types -verify-machineinstrs | FileCheck --check-prefixes CHECK,CHK32 %s
-; RUN: llc < %s --mtriple=wasm64-unknown-unknown -mcpu=mvp -mattr=+reference-types -verify-machineinstrs | FileCheck --check-prefixes CHECK,CHK64 %s
+; RUN: llc < %s --mtriple=wasm32-unknown-unknown -mcpu=mvp -mattr=+reference-types -mattr=+gc -verify-machineinstrs | FileCheck --check-prefixes CHECK,CHK32 %s
+; RUN: llc < %s --mtriple=wasm64-unknown-unknown -mcpu=mvp -mattr=+reference-types -mattr=+gc -verify-machineinstrs | FileCheck --check-prefixes CHECK,CHK64 %s
 
 define void @test_fpsig_void_void(ptr noundef %func) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_fpsig_void_void:
diff --git a/llvm/test/CodeGen/WebAssembly/returned.ll b/llvm/test/CodeGen/WebAssembly/returned.ll
index e767e29..aef75d8 100644
--- a/llvm/test/CodeGen/WebAssembly/returned.ll
+++ b/llvm/test/CodeGen/WebAssembly/returned.ll
@@ -80,3 +80,27 @@ define i32 @test_second_arg(i32 %a, i32 %b) {
     %call = call i32 @do_something_else(i32 %a, i32 %b)
     ret i32 %b
 }
+
+define void @test() {
+; CHECK-LABEL: test:
+; CHECK:         .functype test () -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    global.get $push0=, __stack_pointer
+; CHECK-NEXT:    i32.const $push1=, 16
+; CHECK-NEXT:    i32.sub $push7=, $pop0, $pop1
+; CHECK-NEXT:    local.tee $push6=, $0=, $pop7
+; CHECK-NEXT:    global.set __stack_pointer, $pop6
+; CHECK-NEXT:    i32.const $push4=, 12
+; CHECK-NEXT:    i32.add $push5=, $0, $pop4
+; CHECK-NEXT:    call $drop=, returns_arg, $pop5
+; CHECK-NEXT:    i32.const $push2=, 16
+; CHECK-NEXT:    i32.add $push3=, $0, $pop2
+; CHECK-NEXT:    global.set __stack_pointer, $pop3
+; CHECK-NEXT:    return
+entry:
+  %a = alloca i32
+  call void @llvm.lifetime.start.p0(i64 4, ptr %a)
+  %ret = call ptr @returns_arg(ptr %a)
+  call void @llvm.lifetime.end.p0(i64 4, ptr %a)
+  ret void
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
index e3607e1..36637e1 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
@@ -199,139 +199,17 @@ define <16 x i8> @mul_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; SIMD128-LABEL: mul_v16i8:
 ; SIMD128:         .functype mul_v16i8 (v128, v128) -> (v128)
 ; SIMD128-NEXT:  # %bb.0:
-; SIMD128-NEXT:    i8x16.extract_lane_u $push4=, $0, 0
-; SIMD128-NEXT:    i8x16.extract_lane_u $push3=, $1, 0
-; SIMD128-NEXT:    i32.mul $push5=, $pop4, $pop3
-; SIMD128-NEXT:    i8x16.splat $push6=, $pop5
-; SIMD128-NEXT:    i8x16.extract_lane_u $push1=, $0, 1
-; SIMD128-NEXT:    i8x16.extract_lane_u $push0=, $1, 1
-; SIMD128-NEXT:    i32.mul $push2=, $pop1, $pop0
-; SIMD128-NEXT:    i8x16.replace_lane $push7=, $pop6, 1, $pop2
-; SIMD128-NEXT:    i8x16.extract_lane_u $push9=, $0, 2
-; SIMD128-NEXT:    i8x16.extract_lane_u $push8=, $1, 2
-; SIMD128-NEXT:    i32.mul $push10=, $pop9, $pop8
-; SIMD128-NEXT:    i8x16.replace_lane $push11=, $pop7, 2, $pop10
-; SIMD128-NEXT:    i8x16.extract_lane_u $push13=, $0, 3
-; SIMD128-NEXT:    i8x16.extract_lane_u $push12=, $1, 3
-; SIMD128-NEXT:    i32.mul $push14=, $pop13, $pop12
-; SIMD128-NEXT:    i8x16.replace_lane $push15=, $pop11, 3, $pop14
-; SIMD128-NEXT:    i8x16.extract_lane_u $push17=, $0, 4
-; SIMD128-NEXT:    i8x16.extract_lane_u $push16=, $1, 4
-; SIMD128-NEXT:    i32.mul $push18=, $pop17, $pop16
-; SIMD128-NEXT:    i8x16.replace_lane $push19=, $pop15, 4, $pop18
-; SIMD128-NEXT:    i8x16.extract_lane_u $push21=, $0, 5
-; SIMD128-NEXT:    i8x16.extract_lane_u $push20=, $1, 5
-; SIMD128-NEXT:    i32.mul $push22=, $pop21, $pop20
-; SIMD128-NEXT:    i8x16.replace_lane $push23=, $pop19, 5, $pop22
-; SIMD128-NEXT:    i8x16.extract_lane_u $push25=, $0, 6
-; SIMD128-NEXT:    i8x16.extract_lane_u $push24=, $1, 6
-; SIMD128-NEXT:    i32.mul $push26=, $pop25, $pop24
-; SIMD128-NEXT:    i8x16.replace_lane $push27=, $pop23, 6, $pop26
-; SIMD128-NEXT:    i8x16.extract_lane_u $push29=, $0, 7
-; SIMD128-NEXT:    i8x16.extract_lane_u $push28=, $1, 7
-; SIMD128-NEXT:    i32.mul $push30=, $pop29, $pop28
-; SIMD128-NEXT:    i8x16.replace_lane $push31=, $pop27, 7, $pop30
-; SIMD128-NEXT:    i8x16.extract_lane_u $push33=, $0, 8
-; SIMD128-NEXT:    i8x16.extract_lane_u $push32=, $1, 8
-; SIMD128-NEXT:    i32.mul $push34=, $pop33, $pop32
-; SIMD128-NEXT:    i8x16.replace_lane $push35=, $pop31, 8, $pop34
-; SIMD128-NEXT:    i8x16.extract_lane_u $push37=, $0, 9
-; SIMD128-NEXT:    i8x16.extract_lane_u $push36=, $1, 9
-; SIMD128-NEXT:    i32.mul $push38=, $pop37, $pop36
-; SIMD128-NEXT:    i8x16.replace_lane $push39=, $pop35, 9, $pop38
-; SIMD128-NEXT:    i8x16.extract_lane_u $push41=, $0, 10
-; SIMD128-NEXT:    i8x16.extract_lane_u $push40=, $1, 10
-; SIMD128-NEXT:    i32.mul $push42=, $pop41, $pop40
-; SIMD128-NEXT:    i8x16.replace_lane $push43=, $pop39, 10, $pop42
-; SIMD128-NEXT:    i8x16.extract_lane_u $push45=, $0, 11
-; SIMD128-NEXT:    i8x16.extract_lane_u $push44=, $1, 11
-; SIMD128-NEXT:    i32.mul $push46=, $pop45, $pop44
-; SIMD128-NEXT:    i8x16.replace_lane $push47=, $pop43, 11, $pop46
-; SIMD128-NEXT:    i8x16.extract_lane_u $push49=, $0, 12
-; SIMD128-NEXT:    i8x16.extract_lane_u $push48=, $1, 12
-; SIMD128-NEXT:    i32.mul $push50=, $pop49, $pop48
-; SIMD128-NEXT:    i8x16.replace_lane $push51=, $pop47, 12, $pop50
-; SIMD128-NEXT:    i8x16.extract_lane_u $push53=, $0, 13
-; SIMD128-NEXT:    i8x16.extract_lane_u $push52=, $1, 13
-; SIMD128-NEXT:    i32.mul $push54=, $pop53, $pop52
-; SIMD128-NEXT:    i8x16.replace_lane $push55=, $pop51, 13, $pop54
-; SIMD128-NEXT:    i8x16.extract_lane_u $push57=, $0, 14
-; SIMD128-NEXT:    i8x16.extract_lane_u $push56=, $1, 14
-; SIMD128-NEXT:    i32.mul $push58=, $pop57, $pop56
-; SIMD128-NEXT:    i8x16.replace_lane $push59=, $pop55, 14, $pop58
-; SIMD128-NEXT:    i8x16.extract_lane_u $push61=, $0, 15
-; SIMD128-NEXT:    i8x16.extract_lane_u $push60=, $1, 15
-; SIMD128-NEXT:    i32.mul $push62=, $pop61, $pop60
-; SIMD128-NEXT:    i8x16.replace_lane $push63=, $pop59, 15, $pop62
-; SIMD128-NEXT:    return $pop63
+; SIMD128-NEXT:    i16x8.extmul_low_i8x16_u $push1=, $0, $1
+; SIMD128-NEXT:    i16x8.extmul_high_i8x16_u $push0=, $0, $1
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $pop1, $pop0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; SIMD128-NEXT:    return $pop2
 ;
 ; SIMD128-FAST-LABEL: mul_v16i8:
 ; SIMD128-FAST:         .functype mul_v16i8 (v128, v128) -> (v128)
 ; SIMD128-FAST-NEXT:  # %bb.0:
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push5=, $0, 0
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push4=, $1, 0
-; SIMD128-FAST-NEXT:    i32.mul $push6=, $pop5, $pop4
-; SIMD128-FAST-NEXT:    i8x16.splat $push7=, $pop6
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push2=, $0, 1
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push1=, $1, 1
-; SIMD128-FAST-NEXT:    i32.mul $push3=, $pop2, $pop1
-; SIMD128-FAST-NEXT:    i8x16.replace_lane $push8=, $pop7, 1, $pop3
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push10=, $0, 2
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push9=, $1, 2
-; SIMD128-FAST-NEXT:    i32.mul $push11=, $pop10, $pop9
-; SIMD128-FAST-NEXT:    i8x16.replace_lane $push12=, $pop8, 2, $pop11
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push14=, $0, 3
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push13=, $1, 3
-; SIMD128-FAST-NEXT:    i32.mul $push15=, $pop14, $pop13
-; SIMD128-FAST-NEXT:    i8x16.replace_lane $push16=, $pop12, 3, $pop15
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push18=, $0, 4
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push17=, $1, 4
-; SIMD128-FAST-NEXT:    i32.mul $push19=, $pop18, $pop17
-; SIMD128-FAST-NEXT:    i8x16.replace_lane $push20=, $pop16, 4, $pop19
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push22=, $0, 5
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push21=, $1, 5
-; SIMD128-FAST-NEXT:    i32.mul $push23=, $pop22, $pop21
-; SIMD128-FAST-NEXT:    i8x16.replace_lane $push24=, $pop20, 5, $pop23
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push26=, $0, 6
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push25=, $1, 6
-; SIMD128-FAST-NEXT:    i32.mul $push27=, $pop26, $pop25
-; SIMD128-FAST-NEXT:    i8x16.replace_lane $push28=, $pop24, 6, $pop27
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push30=, $0, 7
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push29=, $1, 7
-; SIMD128-FAST-NEXT:    i32.mul $push31=, $pop30, $pop29
-; SIMD128-FAST-NEXT:    i8x16.replace_lane $push32=, $pop28, 7, $pop31
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push34=, $0, 8
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push33=, $1, 8
-; SIMD128-FAST-NEXT:    i32.mul $push35=, $pop34, $pop33
-; SIMD128-FAST-NEXT:    i8x16.replace_lane $push36=, $pop32, 8, $pop35
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push38=, $0, 9
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push37=, $1, 9
-; SIMD128-FAST-NEXT:    i32.mul $push39=, $pop38, $pop37
-; SIMD128-FAST-NEXT:    i8x16.replace_lane $push40=, $pop36, 9, $pop39
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push42=, $0, 10
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push41=, $1, 10
-; SIMD128-FAST-NEXT:    i32.mul $push43=, $pop42, $pop41
-; SIMD128-FAST-NEXT:    i8x16.replace_lane $push44=, $pop40, 10, $pop43
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push46=, $0, 11
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push45=, $1, 11
-; SIMD128-FAST-NEXT:    i32.mul $push47=, $pop46, $pop45
-; SIMD128-FAST-NEXT:    i8x16.replace_lane $push48=, $pop44, 11, $pop47
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push50=, $0, 12
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push49=, $1, 12
-; SIMD128-FAST-NEXT:    i32.mul $push51=, $pop50, $pop49
-; SIMD128-FAST-NEXT:    i8x16.replace_lane $push52=, $pop48, 12, $pop51
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push54=, $0, 13
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push53=, $1, 13
-; SIMD128-FAST-NEXT:    i32.mul $push55=, $pop54, $pop53
-; SIMD128-FAST-NEXT:    i8x16.replace_lane $push56=, $pop52, 13, $pop55
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push58=, $0, 14
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push57=, $1, 14
-; SIMD128-FAST-NEXT:    i32.mul $push59=, $pop58, $pop57
-; SIMD128-FAST-NEXT:    i8x16.replace_lane $push60=, $pop56, 14, $pop59
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push62=, $0, 15
-; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push61=, $1, 15
-; SIMD128-FAST-NEXT:    i32.mul $push63=, $pop62, $pop61
-; SIMD128-FAST-NEXT:    i8x16.replace_lane $push0=, $pop60, 15, $pop63
+; SIMD128-FAST-NEXT:    i16x8.extmul_low_i8x16_u $push2=, $0, $1
+; SIMD128-FAST-NEXT:    i16x8.extmul_high_i8x16_u $push1=, $0, $1
+; SIMD128-FAST-NEXT:    i8x16.shuffle $push0=, $pop2, $pop1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 ; SIMD128-FAST-NEXT:    return $pop0
 ;
 ; NO-SIMD128-LABEL: mul_v16i8:
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
new file mode 100644
index 0000000..6e2d860
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers  -mattr=+fp16,+simd128,+relaxed-simd | FileCheck %s --check-prefix=RELAXED
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers  -mattr=+fp16,+simd128,              | FileCheck %s --check-prefix=STRICT
+
+target triple = "wasm32"
+
+define double @fsub_fmul_contract_f64(double %a, double %b, double %c) {
+; RELAXED-LABEL: fsub_fmul_contract_f64:
+; RELAXED:         .functype fsub_fmul_contract_f64 (f64, f64, f64) -> (f64)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f64.mul $push0=, $1, $0
+; RELAXED-NEXT:    f64.sub $push1=, $2, $pop0
+; RELAXED-NEXT:    return $pop1
+;
+; STRICT-LABEL: fsub_fmul_contract_f64:
+; STRICT:         .functype fsub_fmul_contract_f64 (f64, f64, f64) -> (f64)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f64.mul $push0=, $1, $0
+; STRICT-NEXT:    f64.sub $push1=, $2, $pop0
+; STRICT-NEXT:    return $pop1
+  %mul = fmul contract double %b, %a
+  %sub = fsub contract double %c, %mul
+  ret double %sub
+}
+
+define <4 x float> @fsub_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; RELAXED-LABEL: fsub_fmul_contract_4xf32:
+; RELAXED:         .functype fsub_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f32x4.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT:    return $pop0
+;
+; STRICT-LABEL: fsub_fmul_contract_4xf32:
+; STRICT:         .functype fsub_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f32x4.mul $push0=, $1, $0
+; STRICT-NEXT:    f32x4.sub $push1=, $2, $pop0
+; STRICT-NEXT:    return $pop1
+  %mul = fmul contract <4 x float> %b, %a
+  %sub = fsub contract <4 x float> %c, %mul
+  ret <4 x float> %sub
+}
+
+
+define <8 x half> @fsub_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fsub_fmul_contract_8xf16:
+; RELAXED:         .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f16x8.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT:    return $pop0
+;
+; STRICT-LABEL: fsub_fmul_contract_8xf16:
+; STRICT:         .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f16x8.mul $push0=, $1, $0
+; STRICT-NEXT:    f16x8.sub $push1=, $2, $pop0
+; STRICT-NEXT:    return $pop1
+  %mul = fmul contract <8 x half> %b, %a
+  %sub = fsub contract <8 x half> %c, %mul
+  ret <8 x half> %sub
+}
+
+
+define <4 x float> @fsub_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; RELAXED-LABEL: fsub_fmul_4xf32:
+; RELAXED:         .functype fsub_fmul_4xf32 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f32x4.mul $push0=, $1, $0
+; RELAXED-NEXT:    f32x4.sub $push1=, $2, $pop0
+; RELAXED-NEXT:    return $pop1
+;
+; STRICT-LABEL: fsub_fmul_4xf32:
+; STRICT:         .functype fsub_fmul_4xf32 (v128, v128, v128) -> (v128)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f32x4.mul $push0=, $1, $0
+; STRICT-NEXT:    f32x4.sub $push1=, $2, $pop0
+; STRICT-NEXT:    return $pop1
+  %mul = fmul <4 x float> %b, %a
+  %sub = fsub contract <4 x float> %c, %mul
+  ret <4 x float> %sub
+}
+
+define <8 x float> @fsub_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
+; RELAXED-LABEL: fsub_fmul_contract_8xf32:
+; RELAXED:         .functype fsub_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f32x4.relaxed_nmadd $push0=, $6, $4, $2
+; RELAXED-NEXT:    v128.store 16($0), $pop0
+; RELAXED-NEXT:    f32x4.relaxed_nmadd $push1=, $5, $3, $1
+; RELAXED-NEXT:    v128.store 0($0), $pop1
+; RELAXED-NEXT:    return
+;
+; STRICT-LABEL: fsub_fmul_contract_8xf32:
+; STRICT:         .functype fsub_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f32x4.mul $push0=, $4, $2
+; STRICT-NEXT:    f32x4.sub $push1=, $6, $pop0
+; STRICT-NEXT:    v128.store 16($0), $pop1
+; STRICT-NEXT:    f32x4.mul $push2=, $3, $1
+; STRICT-NEXT:    f32x4.sub $push3=, $5, $pop2
+; STRICT-NEXT:    v128.store 0($0), $pop3
+; STRICT-NEXT:    return
+  %mul = fmul contract <8 x float> %b, %a
+  %sub = fsub contract <8 x float> %c, %mul
+  ret <8 x float> %sub
+}
+
+
+define <2 x double> @fsub_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fsub_fmul_contract_2xf64:
+; RELAXED:         .functype fsub_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f64x2.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT:    return $pop0
+;
+; STRICT-LABEL: fsub_fmul_contract_2xf64:
+; STRICT:         .functype fsub_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f64x2.mul $push0=, $1, $0
+; STRICT-NEXT:    f64x2.sub $push1=, $2, $pop0
+; STRICT-NEXT:    return $pop1
+  %mul = fmul contract <2 x double> %b, %a
+  %sub = fsub contract <2 x double> %c, %mul
+  ret <2 x double> %sub
+}
+
+define float @fsub_fmul_contract_f32(float %a, float %b, float %c) {
+; RELAXED-LABEL: fsub_fmul_contract_f32:
+; RELAXED:         .functype fsub_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f32.mul $push0=, $1, $0
+; RELAXED-NEXT:    f32.sub $push1=, $2, $pop0
+; RELAXED-NEXT:    return $pop1
+;
+; STRICT-LABEL: fsub_fmul_contract_f32:
+; STRICT:         .functype fsub_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f32.mul $push0=, $1, $0
+; STRICT-NEXT:    f32.sub $push1=, $2, $pop0
+; STRICT-NEXT:    return $pop1
+  %mul = fmul contract float %b, %a
+  %sub = fsub contract float %c, %mul
+  ret float %sub
+}
+
diff --git a/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll b/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll
index 1c77ad5..60cfc27 100644
--- a/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll
+++ b/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll
@@ -70,9 +70,9 @@ target triple = "wasm32-unknown-unknown"
 ;                +call-indirect-overlong, +exception-handling,
 ;                +extended-const, +fp16, +multimemory, +multivalue,
 ;                +mutable-globals, +nontrapping-fptoint, +relaxed-simd,
-;                +reference-types, +simd128, +sign-ext, +tail-call
+;                +reference-types, +simd128, +sign-ext, +tail-call, +gc
 ; BLEEDING-EDGE-LABEL: .section  .custom_section.target_features,"",@
-; BLEEDING-EDGE-NEXT: .int8  16
+; BLEEDING-EDGE-NEXT: .int8  17
 ; BLEEDING-EDGE-NEXT: .int8  43
 ; BLEEDING-EDGE-NEXT: .int8  7
 ; BLEEDING-EDGE-NEXT: .ascii  "atomics"
@@ -95,6 +95,9 @@ target triple = "wasm32-unknown-unknown"
 ; BLEEDING-EDGE-NEXT: .int8  4
 ; BLEEDING-EDGE-NEXT: .ascii  "fp16"
 ; BLEEDING-EDGE-NEXT: .int8  43
+; BLEEDING-EDGE-NEXT: .int8  2
+; BLEEDING-EDGE-NEXT: .ascii  "gc"
+; BLEEDING-EDGE-NEXT: .int8  43
 ; BLEEDING-EDGE-NEXT: .int8  11
 ; BLEEDING-EDGE-NEXT: .ascii  "multimemory"
 ; BLEEDING-EDGE-NEXT: .int8  43
diff --git a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
index 1d194b6..4c30a3a 100644
--- a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
+++ b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
@@ -116,40 +116,28 @@ define i8 @pairwise_mul_v16i8(<16 x i8> %arg) {
 ; SIMD128-LABEL: pairwise_mul_v16i8:
 ; SIMD128:         .functype pairwise_mul_v16i8 (v128) -> (i32)
 ; SIMD128-NEXT:  # %bb.0:
-; SIMD128-NEXT:    i8x16.extract_lane_u $push26=, $0, 0
-; SIMD128-NEXT:    i8x16.shuffle $push32=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
-; SIMD128-NEXT:    local.tee $push31=, $1=, $pop32
-; SIMD128-NEXT:    i8x16.extract_lane_u $push25=, $pop31, 0
-; SIMD128-NEXT:    i32.mul $push27=, $pop26, $pop25
-; SIMD128-NEXT:    i8x16.extract_lane_u $push23=, $0, 4
-; SIMD128-NEXT:    i8x16.extract_lane_u $push22=, $1, 4
-; SIMD128-NEXT:    i32.mul $push24=, $pop23, $pop22
-; SIMD128-NEXT:    i32.mul $push28=, $pop27, $pop24
-; SIMD128-NEXT:    i8x16.extract_lane_u $push19=, $0, 2
-; SIMD128-NEXT:    i8x16.extract_lane_u $push18=, $1, 2
-; SIMD128-NEXT:    i32.mul $push20=, $pop19, $pop18
-; SIMD128-NEXT:    i8x16.extract_lane_u $push16=, $0, 6
-; SIMD128-NEXT:    i8x16.extract_lane_u $push15=, $1, 6
-; SIMD128-NEXT:    i32.mul $push17=, $pop16, $pop15
-; SIMD128-NEXT:    i32.mul $push21=, $pop20, $pop17
-; SIMD128-NEXT:    i32.mul $push29=, $pop28, $pop21
-; SIMD128-NEXT:    i8x16.extract_lane_u $push11=, $0, 1
-; SIMD128-NEXT:    i8x16.extract_lane_u $push10=, $1, 1
-; SIMD128-NEXT:    i32.mul $push12=, $pop11, $pop10
-; SIMD128-NEXT:    i8x16.extract_lane_u $push8=, $0, 5
-; SIMD128-NEXT:    i8x16.extract_lane_u $push7=, $1, 5
-; SIMD128-NEXT:    i32.mul $push9=, $pop8, $pop7
-; SIMD128-NEXT:    i32.mul $push13=, $pop12, $pop9
-; SIMD128-NEXT:    i8x16.extract_lane_u $push4=, $0, 3
-; SIMD128-NEXT:    i8x16.extract_lane_u $push3=, $1, 3
-; SIMD128-NEXT:    i32.mul $push5=, $pop4, $pop3
-; SIMD128-NEXT:    i8x16.extract_lane_u $push1=, $0, 7
-; SIMD128-NEXT:    i8x16.extract_lane_u $push0=, $1, 7
-; SIMD128-NEXT:    i32.mul $push2=, $pop1, $pop0
-; SIMD128-NEXT:    i32.mul $push6=, $pop5, $pop2
-; SIMD128-NEXT:    i32.mul $push14=, $pop13, $pop6
-; SIMD128-NEXT:    i32.mul $push30=, $pop29, $pop14
-; SIMD128-NEXT:    return $pop30
+; SIMD128-NEXT:    i8x16.shuffle $push20=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    local.tee $push19=, $1=, $pop20
+; SIMD128-NEXT:    i16x8.extmul_low_i8x16_u $push1=, $0, $pop19
+; SIMD128-NEXT:    i16x8.extmul_high_i8x16_u $push0=, $0, $1
+; SIMD128-NEXT:    i8x16.shuffle $push18=, $pop1, $pop0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; SIMD128-NEXT:    local.tee $push17=, $0=, $pop18
+; SIMD128-NEXT:    i8x16.shuffle $push16=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    local.tee $push15=, $1=, $pop16
+; SIMD128-NEXT:    i16x8.extmul_low_i8x16_u $push3=, $pop17, $pop15
+; SIMD128-NEXT:    i16x8.extmul_high_i8x16_u $push2=, $0, $1
+; SIMD128-NEXT:    i8x16.shuffle $push14=, $pop3, $pop2, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; SIMD128-NEXT:    local.tee $push13=, $0=, $pop14
+; SIMD128-NEXT:    i8x16.shuffle $push12=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    local.tee $push11=, $1=, $pop12
+; SIMD128-NEXT:    i16x8.extmul_low_i8x16_u $push5=, $pop13, $pop11
+; SIMD128-NEXT:    i16x8.extmul_high_i8x16_u $push4=, $0, $1
+; SIMD128-NEXT:    i8x16.shuffle $push10=, $pop5, $pop4, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; SIMD128-NEXT:    local.tee $push9=, $0=, $pop10
+; SIMD128-NEXT:    i8x16.shuffle $push6=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i16x8.extmul_low_i8x16_u $push7=, $pop9, $pop6
+; SIMD128-NEXT:    i8x16.extract_lane_u $push8=, $pop7, 0
+; SIMD128-NEXT:    return $pop8
   %res = tail call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %arg)
   ret i8 %res
 }