Diffstat (limited to 'llvm/test')
278 files changed, 20781 insertions, 8783 deletions
diff --git a/llvm/test/Analysis/BasicAA/modref.ll b/llvm/test/Analysis/BasicAA/modref.ll index 0619f8e..1aab28f3 100644 --- a/llvm/test/Analysis/BasicAA/modref.ll +++ b/llvm/test/Analysis/BasicAA/modref.ll @@ -67,27 +67,33 @@ define i8 @test2a(ptr %P) { ret i8 %A } -define void @test3(ptr %P, i8 %X) { +define void @test3(i8 %X) { ; CHECK-LABEL: @test3( -; CHECK-NEXT: [[P2:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 2 +; CHECK-NEXT: [[P:%.*]] = alloca i64, align 8 +; CHECK-NEXT: [[P2:%.*]] = getelementptr i8, ptr [[P]], i32 2 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr [[P]]) ; CHECK-NEXT: store i8 2, ptr [[P2]], align 1 +; CHECK-NEXT: call void @external(ptr [[P]]) ; CHECK-NEXT: ret void ; + %P = alloca i64 %Y = add i8 %X, 1 ;; Dead, because the only use (the store) is dead. %P2 = getelementptr i8, ptr %P, i32 2 store i8 %Y, ptr %P2 ;; Not read by lifetime.end, should be removed. call void @llvm.lifetime.end.p0(i64 1, ptr %P) store i8 2, ptr %P2 + call void @external(ptr %P) ret void } -define void @test3a(ptr %P, i8 %X) { +define void @test3a(i8 %X) { ; CHECK-LABEL: @test3a( -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 10, ptr [[P:%.*]]) +; CHECK-NEXT: [[P:%.*]] = alloca i64, align 8 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 10, ptr [[P]]) ; CHECK-NEXT: ret void ; + %P = alloca i64 %Y = add i8 %X, 1 ;; Dead, because the only use (the store) is dead. %P2 = getelementptr i8, ptr %P, i32 2 diff --git a/llvm/test/Analysis/CallGraph/ignore-assumelike-calls.ll b/llvm/test/Analysis/CallGraph/ignore-assumelike-calls.ll index 658d738..1c9d201 100644 --- a/llvm/test/Analysis/CallGraph/ignore-assumelike-calls.ll +++ b/llvm/test/Analysis/CallGraph/ignore-assumelike-calls.ll @@ -10,7 +10,7 @@ ; CHECK-EMPTY: ; CHECK-NEXT: Call graph node for function: 'bitcast_only'<<{{.*}}>> #uses=0 ; CHECK-EMPTY: -; CHECK-NEXT: Call graph node for function: 'llvm.lifetime.start.p0'<<{{.*}}>> #uses=3 +; CHECK-NEXT: Call graph node for function: 'llvm.lifetime.start.p0'<<{{.*}}>> #uses=2 ; CHECK-EMPTY: ; CHECK-NEXT: Call graph node for function: 'llvm.memset.p0.i64'<<{{.*}}>> #uses=2 ; CHECK-EMPTY: @@ -25,18 +25,11 @@ ; CHECK-NEXT: Call graph node for function: 'used_by_lifetime'<<{{.*}}>> #uses=0 ; CHECK-NEXT: CS<{{.*}}> calls function 'llvm.lifetime.start.p0' ; CHECK-EMPTY: -; CHECK-NEXT: Call graph node for function: 'used_by_lifetime_cast'<<{{.*}}>> #uses=0 -; CHECK-NEXT: CS<{{.*}}> calls function 'llvm.lifetime.start.p0' -; CHECK-EMPTY: define internal void @used_by_lifetime() { entry: - call void @llvm.lifetime.start.p0(i64 4, ptr @used_by_lifetime) - ret void -} - -define internal void @used_by_lifetime_cast() addrspace(1) { - call void @llvm.lifetime.start.p0(i64 4, ptr addrspacecast (ptr addrspace(1) @used_by_lifetime_cast to ptr)) + %a = alloca i8 + call void @llvm.lifetime.start.p0(i64 4, ptr %a) ret void } diff --git a/llvm/test/Analysis/CostModel/X86/free-intrinsics.ll b/llvm/test/Analysis/CostModel/X86/free-intrinsics.ll index a8c5c43..3a54428 100644 --- a/llvm/test/Analysis/CostModel/X86/free-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/X86/free-intrinsics.ll @@ -4,6 +4,7 @@ define i32 @trivially_free() { ; CHECK-SIZE-LABEL: 'trivially_free' +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %alloca = alloca i8, align 1 ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a0 = call i32 @llvm.annotation.i32.p0(i32 undef, ptr undef, ptr undef, i32 undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: call void @llvm.assume(i1 undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.experimental.noalias.scope.decl(metadata !3) @@ -13,14 +14,15 @@ define i32 @trivially_free() { ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a3 = call ptr @llvm.strip.invariant.group.p0(ptr undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4 = call i1 @llvm.is.constant.i32(i32 undef) -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr undef) -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr undef) +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr %alloca) +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr %alloca) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a5 = call i64 @llvm.objectsize.i64.p0(ptr undef, i1 true, i1 true, i1 true) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a6 = call ptr @llvm.ptr.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.var.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; CHECK-THROUGHPUT-LABEL: 'trivially_free' +; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %alloca = alloca i8, align 1 ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a0 = call i32 @llvm.annotation.i32.p0(i32 undef, ptr undef, ptr undef, i32 undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.experimental.noalias.scope.decl(metadata !3) @@ -30,13 +32,14 @@ define i32 @trivially_free() { ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a3 = call ptr @llvm.strip.invariant.group.p0(ptr undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4 = call i1 @llvm.is.constant.i32(i32 undef) -; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr undef) -; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr undef) +; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr %alloca) +; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr %alloca) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a5 = call i64 @llvm.objectsize.i64.p0(ptr undef, i1 true, i1 true, i1 true) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 
for instruction: %a6 = call ptr @llvm.ptr.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.var.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %alloca = alloca i8 %a0 = call i32 @llvm.annotation.i32(i32 undef, ptr undef, ptr undef, i32 undef) call void @llvm.assume(i1 undef) call void @llvm.experimental.noalias.scope.decl(metadata !4) @@ -46,8 +49,8 @@ define i32 @trivially_free() { %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef) %a3 = call ptr @llvm.strip.invariant.group.p0(ptr undef) %a4 = call i1 @llvm.is.constant.i32(i32 undef) - call void @llvm.lifetime.start.p0(i64 1, ptr undef) - call void @llvm.lifetime.end.p0(i64 1, ptr undef) + call void @llvm.lifetime.start.p0(i64 1, ptr %alloca) + call void @llvm.lifetime.end.p0(i64 1, ptr %alloca) %a5 = call i64 @llvm.objectsize.i64.p0(ptr undef, i1 1, i1 1, i1 1) %a6 = call ptr @llvm.ptr.annotation.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef) call void @llvm.var.annotation(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef) diff --git a/llvm/test/Analysis/CostModel/free-intrinsics-datalayout.ll b/llvm/test/Analysis/CostModel/free-intrinsics-datalayout.ll index 560af3d..96064dc 100644 --- a/llvm/test/Analysis/CostModel/free-intrinsics-datalayout.ll +++ b/llvm/test/Analysis/CostModel/free-intrinsics-datalayout.ll @@ -6,6 +6,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define i32 @trivially_free() { ; CHECK-SIZE-LABEL: 'trivially_free' +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %alloca = alloca i8, align 4 ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a0 = call i32 @llvm.annotation.i32.p0(i32 undef, ptr undef, ptr undef, i32 undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.experimental.noalias.scope.decl(metadata !3) @@ -15,8 +16,8 @@ define i32 @trivially_free() { ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a3 = call ptr @llvm.strip.invariant.group.p0(ptr undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4 = call i1 @llvm.is.constant.i32(i32 undef) -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr undef) -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr undef) +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr %alloca) +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr %alloca) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a5 = call i64 @llvm.objectsize.i64.p0(ptr undef, i1 true, i1 true, i1 true) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a6 = call ptr @llvm.ptr.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef) ; CHECK-SIZE-NEXT: Cost Model: Found 
an estimated cost of 0 for instruction: %a7 = call i1 @llvm.allow.ubsan.check(i8 123) @@ -25,6 +26,7 @@ define i32 @trivially_free() { ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; CHECK-THROUGHPUT-LABEL: 'trivially_free' +; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %alloca = alloca i8, align 4 ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a0 = call i32 @llvm.annotation.i32.p0(i32 undef, ptr undef, ptr undef, i32 undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.experimental.noalias.scope.decl(metadata !3) @@ -34,8 +36,8 @@ define i32 @trivially_free() { ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a3 = call ptr @llvm.strip.invariant.group.p0(ptr undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4 = call i1 @llvm.is.constant.i32(i32 undef) -; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr undef) -; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr undef) +; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr %alloca) +; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr %alloca) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a5 = call i64 @llvm.objectsize.i64.p0(ptr undef, i1 true, i1 true, i1 true) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a6 = call ptr @llvm.ptr.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a7 = call i1 @llvm.allow.ubsan.check(i8 123) @@ -43,6 +45,7 @@ define i32 @trivially_free() { ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.var.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; + %alloca = alloca i8 %a0 = call i32 @llvm.annotation.i32(i32 undef, ptr undef, ptr undef, i32 undef) call void @llvm.assume(i1 undef) call void @llvm.experimental.noalias.scope.decl(metadata !4) @@ -52,8 +55,8 @@ define i32 @trivially_free() { %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef) %a3 = call ptr @llvm.strip.invariant.group.p0(ptr undef) %a4 = call i1 @llvm.is.constant.i32(i32 undef) - call void @llvm.lifetime.start.p0(i64 1, ptr undef) - call void @llvm.lifetime.end.p0(i64 1, ptr undef) + call void @llvm.lifetime.start.p0(i64 1, ptr %alloca) + call void @llvm.lifetime.end.p0(i64 1, ptr %alloca) %a5 = call i64 @llvm.objectsize.i64.p0(ptr undef, i1 1, i1 1, i1 1) %a6 = call ptr @llvm.ptr.annotation.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef) %a7 = call i1 @llvm.allow.ubsan.check(i8 123) diff --git a/llvm/test/Analysis/CostModel/free-intrinsics-no_info.ll 
b/llvm/test/Analysis/CostModel/free-intrinsics-no_info.ll index 53828f2..f989ebe 100644 --- a/llvm/test/Analysis/CostModel/free-intrinsics-no_info.ll +++ b/llvm/test/Analysis/CostModel/free-intrinsics-no_info.ll @@ -4,6 +4,7 @@ define i32 @trivially_free() { ; CHECK-SIZE-LABEL: 'trivially_free' +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %alloca = alloca i8, align 1 ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a0 = call i32 @llvm.annotation.i32.p0(i32 undef, ptr undef, ptr undef, i32 undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.experimental.noalias.scope.decl(metadata !3) @@ -13,8 +14,8 @@ define i32 @trivially_free() { ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a3 = call ptr @llvm.strip.invariant.group.p0(ptr undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4 = call i1 @llvm.is.constant.i32(i32 undef) -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr undef) -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr undef) +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr %alloca) +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr %alloca) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a5 = call i64 @llvm.objectsize.i64.p0(ptr undef, i1 true, i1 true, i1 true) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a6 = call ptr @llvm.ptr.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.var.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef) @@ -23,6 +24,7 @@ define i32 @trivially_free() { ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; CHECK-THROUGHPUT-LABEL: 'trivially_free' +; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %alloca = alloca i8, align 1 ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a0 = call i32 @llvm.annotation.i32.p0(i32 undef, ptr undef, ptr undef, i32 undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.experimental.noalias.scope.decl(metadata !3) @@ -32,8 +34,8 @@ define i32 @trivially_free() { ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a3 = call ptr @llvm.strip.invariant.group.p0(ptr undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4 = call i1 @llvm.is.constant.i32(i32 undef) -; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void 
@llvm.lifetime.start.p0(i64 1, ptr undef) -; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr undef) +; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr %alloca) +; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr %alloca) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a5 = call i64 @llvm.objectsize.i64.p0(ptr undef, i1 true, i1 true, i1 true) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a6 = call ptr @llvm.ptr.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.var.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef) @@ -41,6 +43,7 @@ define i32 @trivially_free() { ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a8 = call i1 @llvm.allow.runtime.check(metadata !"test_check") ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; + %alloca = alloca i8 %a0 = call i32 @llvm.annotation.i32(i32 undef, ptr undef, ptr undef, i32 undef) call void @llvm.assume(i1 undef) call void @llvm.experimental.noalias.scope.decl(metadata !4) @@ -50,8 +53,8 @@ define i32 @trivially_free() { %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef) %a3 = call ptr @llvm.strip.invariant.group.p0(ptr undef) %a4 = call i1 @llvm.is.constant.i32(i32 undef) - call void @llvm.lifetime.start.p0(i64 1, ptr undef) - call void @llvm.lifetime.end.p0(i64 1, ptr undef) + call void @llvm.lifetime.start.p0(i64 1, ptr %alloca) + call void @llvm.lifetime.end.p0(i64 1, ptr %alloca) %a5 = call i64 @llvm.objectsize.i64.p0(ptr undef, i1 1, i1 1, i1 1) %a6 = call ptr @llvm.ptr.annotation.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef) call void @llvm.var.annotation(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef) diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll b/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll index 0d1b082..72b620a 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll @@ -106,8 +106,10 @@ exit: ret void } -define void @backward_dep_known_distance_less_than_btc(ptr %A) { -; CHECK-LABEL: 'backward_dep_known_distance_less_than_btc' +; TODO: The loop should be safe without dependence, as all accesses to %l are +; completely before the first store.
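The hunk below renames the existing test to @backward_dep_known_safe_due_to_backedge_taken_count and reuses its former name for a new test whose backward distance really is smaller than the backedge-taken count. For that new test the index arithmetic works out as follows; this is a worked sketch derived from its IR, with the width figure taken from its CHECK lines:

    iv runs over [0, 255]                  ; exit when iv.next == 256
    load  %gep        -> &A[2 * iv]        ; reads  i32 indices 0, 2, ..., 510
    store %gep.mul.2  -> &A[510 + iv]      ; writes i32 indices 510, 511, ..., 765

The two access sets overlap only at index 510 (written at iv = 0, read at iv = 255), so the distance is known at compile time and LAA can classify the pair as BackwardVectorizable with a maximum safe vector width of 4064 bits instead of rejecting the loop.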
+define void @backward_dep_known_safe_due_to_backedge_taken_count(ptr %A) { +; CHECK-LABEL: 'backward_dep_known_safe_due_to_backedge_taken_count' ; CHECK-NEXT: loop: ; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 8160 bits ; CHECK-NEXT: Dependences: @@ -142,3 +144,40 @@ loop: exit: ret void } + +define void @backward_dep_known_distance_less_than_btc(ptr %A) { +; CHECK-LABEL: 'backward_dep_known_distance_less_than_btc' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 4064 bits +; CHECK-NEXT: Dependences: +; CHECK-NEXT: BackwardVectorizable: +; CHECK-NEXT: %l = load i32, ptr %gep, align 4 -> +; CHECK-NEXT: store i32 %add, ptr %gep.mul.2, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %A.510 = getelementptr inbounds i32, ptr %A, i64 510 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.mul.2 = shl nuw nsw i64 %iv, 1 + %gep = getelementptr inbounds i32, ptr %A, i64 %iv.mul.2 + %l = load i32, ptr %gep, align 4 + %add = add nsw i32 %l, 5 + %gep.mul.2 = getelementptr inbounds i32, ptr %A.510, i64 %iv + store i32 %add, ptr %gep.mul.2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 256 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Analysis/MemorySSA/lifetime-simple.ll b/llvm/test/Analysis/MemorySSA/lifetime-simple.ll index d409c14..18d2459 100644 --- a/llvm/test/Analysis/MemorySSA/lifetime-simple.ll +++ b/llvm/test/Analysis/MemorySSA/lifetime-simple.ll @@ -2,8 +2,12 @@ ; This test checks that lifetime markers are considered clobbers of %P, ; and due to lack of noalias information, of %Q as well. 
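In the rewritten test below, %P becomes a local alloca and %Q is laundered through @obscure, an opaque memory(none) function, so the lifetime markers get the alloca operand they need while the two pointers still cannot be proven distinct. A minimal sketch of the aliasing setup (the trailing load is illustrative only, not part of the test):

    declare ptr @obscure(ptr) memory(none)

    %P = alloca [32 x i8]
    %Q = call ptr @obscure(ptr %P)     ; memory(none): may simply return %P
    call void @llvm.lifetime.start.p0(i64 32, ptr %P)
    %v = load i8, ptr %Q               ; %Q may alias %P, so the marker is a clobber here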
-define i8 @test(ptr %P, ptr %Q) { +declare ptr @obscure(ptr) memory(none) + +define i8 @test() { entry: + %P = alloca [32 x i8] + %Q = call ptr @obscure(ptr %P) ; CHECK: 1 = MemoryDef(liveOnEntry) ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr %P) call void @llvm.lifetime.start.p0(i64 32, ptr %P) diff --git a/llvm/test/Analysis/MemorySSA/pr43427.ll b/llvm/test/Analysis/MemorySSA/pr43427.ll index a9b442c..254fb11 100644 --- a/llvm/test/Analysis/MemorySSA/pr43427.ll +++ b/llvm/test/Analysis/MemorySSA/pr43427.ll @@ -30,7 +30,7 @@ ; CHECK-NEXT: ; [[NO6:.*]] = MemoryDef([[NO7]]) ; CHECK-NEXT: store i16 undef, ptr %e, align 1 ; CHECK-NEXT: 3 = MemoryDef([[NO6]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr null) +; CHECK-NEXT: call void @g() define void @f(i1 %arg) { entry: @@ -57,7 +57,7 @@ cleanup: ; preds = %lbl3 br i1 %switch, label %cleanup.cont, label %lbl1 cleanup.cont: ; preds = %cleanup - call void @llvm.lifetime.end.p0(i64 1, ptr null) + call void @g() ret void if.else: ; preds = %lbl1 @@ -65,6 +65,3 @@ if.else: ; preds = %lbl1 } declare void @g() - -; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) diff --git a/llvm/test/Analysis/MemorySSA/pr43438.ll b/llvm/test/Analysis/MemorySSA/pr43438.ll index d137c52..0e09137 100644 --- a/llvm/test/Analysis/MemorySSA/pr43438.ll +++ b/llvm/test/Analysis/MemorySSA/pr43438.ll @@ -87,7 +87,7 @@ if.else: ; preds = %lbl1 ] if.end12: ; preds = %cleanup.cont11s, %cleanup.cont - call void @llvm.lifetime.end.p0(i64 1, ptr undef) + call i16 @g(i16 1) ret void unreachable: ; preds = %if.else, %for.end5 @@ -95,6 +95,3 @@ unreachable: ; preds = %if.else, %for.end5 } declare i16 @g(i16) - -; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) diff --git a/llvm/test/Analysis/StackSafetyAnalysis/lifetime.ll b/llvm/test/Analysis/StackSafetyAnalysis/lifetime.ll index 37fa7d3e..7fa1cf4 100644 --- a/llvm/test/Analysis/StackSafetyAnalysis/lifetime.ll +++ b/llvm/test/Analysis/StackSafetyAnalysis/lifetime.ll @@ -786,83 +786,6 @@ end: ret void } -define void @non_alloca(ptr %p) { -; CHECK-LABEL: define void @non_alloca -entry: -; CHECK: entry: -; MAY-NEXT: Alive: <x y> -; MUST-NEXT: Alive: <> - %x = alloca i8, align 4 - %y = alloca i8, align 4 - - call void @llvm.lifetime.start.p0(i64 4, ptr %p) -; CHECK: call void @llvm.lifetime.start.p0(i64 4, ptr %p) -; MAY-NEXT: Alive: <x y> -; MUST-NEXT: Alive: <> - - call void @llvm.lifetime.start.p0(i64 4, ptr %x) -; CHECK: call void @llvm.lifetime.start.p0(i64 4, ptr %x) -; MAY-NEXT: Alive: <x y> -; MUST-NEXT: Alive: <> - - call void @llvm.lifetime.end.p0(i64 4, ptr %p) -; CHECK: call void @llvm.lifetime.end.p0(i64 4, ptr %p) -; MAY-NEXT: Alive: <x y> -; MUST-NEXT: Alive: <> - - ret void -} - -define void @select_alloca(i1 %v) { -; CHECK-LABEL: define void @select_alloca -entry: -; CHECK: entry: -; MAY-NEXT: Alive: <x y> -; MUST-NEXT: Alive: <> - %x = alloca i8, align 4 - %y = alloca i8, align 4 - %cxcy = select i1 %v, ptr %x, ptr %y - - call void @llvm.lifetime.start.p0(i64 1, ptr %cxcy) -; CHECK: call void @llvm.lifetime.start.p0(i64 1, ptr %cxcy) -; MAY-NEXT: Alive: <x y> -; MUST-NEXT: Alive: <> - - call void @llvm.lifetime.start.p0(i64 1, ptr %x) -; CHECK: call void @llvm.lifetime.start.p0(i64 1, ptr %x) -; MAY-NEXT: Alive: <x y> -; MUST-NEXT: Alive: <> - - call void @llvm.lifetime.end.p0(i64 1, ptr %x) -; CHECK: call void @llvm.lifetime.end.p0(i64 1, ptr %x) -; MAY-NEXT: Alive: <x 
y> -; MUST-NEXT: Alive: <> - - ret void -} - -define void @alloca_offset() { -; CHECK-LABEL: define void @alloca_offset -entry: -; CHECK: entry: -; MAY-NEXT: Alive: <x> -; MUST-NEXT: Alive: <> - %x = alloca [5 x i32], align 4 - %x2 = getelementptr [5 x i32], ptr %x, i64 0, i64 1 - - call void @llvm.lifetime.start.p0(i64 20, ptr %x2) -; CHECK: call void @llvm.lifetime.start.p0(i64 20, ptr %x2) -; MAY-NEXT: Alive: <x> -; MUST-NEXT: Alive: <> - - call void @llvm.lifetime.end.p0(i64 20, ptr %x2) -; CHECK: call void @llvm.lifetime.end.p0(i64 20, ptr %x2) -; MAY-NEXT: Alive: <x> -; MUST-NEXT: Alive: <> - - ret void -} - define void @alloca_size() { ; CHECK-LABEL: define void @alloca_size entry: diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 705c128..10c656a 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -302,6 +302,14 @@ define amdgpu_kernel void @wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, <8 ret void } +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) +define amdgpu_ps void @wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + ; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 false, <16 x half> %A, i1 false, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) define amdgpu_ps void @swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) @@ -836,6 +844,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>, declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1) declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x float>, i16, i1, i1) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1) declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x half>, i16, i1, i1) diff --git a/llvm/test/Assembler/autoupgrade-lifetime-intrinsics.ll b/llvm/test/Assembler/autoupgrade-lifetime-intrinsics.ll new file mode 100644 index 0000000..00ab934 --- /dev/null +++ b/llvm/test/Assembler/autoupgrade-lifetime-intrinsics.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S < %s | FileCheck %s +
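This new test pins down how plain opt -S autoupgrades lifetime intrinsics: a marker whose pointer operand is a bitcast, addrspacecast, or getelementptr of an alloca is rewritten to name the alloca directly (the addrspace(1) form is also retargeted to the p0 intrinsic), and a marker on a pointer that cannot be traced back to an alloca is dropped entirely. Presumably the stripping also composes through a chain of such operations; a hypothetical input illustrating that assumption (not part of the test):

    %a = alloca [4 x i8]
    %g = getelementptr [4 x i8], ptr %a, i64 0, i64 0
    %b = bitcast ptr %g to ptr
    call void @llvm.lifetime.start.p0(i64 4, ptr %b)   ; expected to become ... ptr %a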
+define void @strip_bitcast() { +; CHECK-LABEL: define void @strip_bitcast() { +; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[B:%.*]] = bitcast ptr [[A]] to ptr +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 1, ptr [[A]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr [[A]]) +; CHECK-NEXT: ret void +; + %a = alloca i8 + %b = bitcast ptr %a to ptr + call void @llvm.lifetime.start.p0(i64 1, ptr %b) + call void @llvm.lifetime.end.p0(i64 1, ptr %b) + ret void +} + +define void @strip_addrspacecast() { +; CHECK-LABEL: define void @strip_addrspacecast() { +; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[B:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 1, ptr [[A]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr [[A]]) +; CHECK-NEXT: ret void +; + %a = alloca i8 + %b = addrspacecast ptr %a to ptr addrspace(1) + call void @llvm.lifetime.start.p1(i64 1, ptr addrspace(1) %b) + call void @llvm.lifetime.end.p1(i64 1, ptr addrspace(1) %b) + ret void +} + +define void @strip_gep() { +; CHECK-LABEL: define void @strip_gep() { +; CHECK-NEXT: [[A:%.*]] = alloca [2 x i8], align 1 +; CHECK-NEXT: [[B:%.*]] = getelementptr [2 x i8], ptr [[A]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 1, ptr [[A]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr [[A]]) +; CHECK-NEXT: ret void +; + %a = alloca [2 x i8] + %b = getelementptr [2 x i8], ptr %a, i64 0, i64 0 + call void @llvm.lifetime.start.p0(i64 1, ptr %b) + call void @llvm.lifetime.end.p0(i64 1, ptr %b) + ret void +} + +define void @remove_unanalyzable(ptr %p) { +; CHECK-LABEL: define void @remove_unanalyzable( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: ret void +; + call void @llvm.lifetime.start.p0(i64 1, ptr %p) + call void @llvm.lifetime.end.p0(i64 1, ptr %p) + ret void +} diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll index 9cf3fdb..0b5ce08 100644 --- a/llvm/test/Bitcode/compatibility.ll +++ b/llvm/test/Bitcode/compatibility.ll @@ -564,6 +564,10 @@ declare riscv_vls_cc(32768) void @riscv_vls_cc_32768() ; CHECK: declare riscv_vls_cc(32768) void @riscv_vls_cc_32768() declare riscv_vls_cc(65536) void @riscv_vls_cc_65536() ; CHECK: declare riscv_vls_cc(65536) void @riscv_vls_cc_65536() +declare cc124 void @f.cc124(i1) +; CHECK: declare amdgpu_gfx_whole_wave void @f.cc124(i1) +declare amdgpu_gfx_whole_wave void @f.amdgpu_gfx_whole_wave(i1) +; CHECK: declare amdgpu_gfx_whole_wave void @f.amdgpu_gfx_whole_wave(i1) declare cc1023 void @f.cc1023() ; CHECK: declare cc1023 void @f.cc1023() diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-switch-split.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-switch-split.ll index 55cf48e..d1a6584a 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-switch-split.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-switch-split.ll @@ -9,7 +9,7 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0 declare i32 @logg(...) -define i32 @scanfile(i32 %call148) { +define i32 @scanfile(i32 %call148, ptr %p) { ; CHECK-LABEL: scanfile: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill @@ -26,7 +26,7 @@ define i32 @scanfile(i32 %call148) { ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; CHECK-NEXT: ret ; CHECK-NEXT: LBB0_3: ; %entry -; CHECK-NEXT: b.eq LBB0_2 +; CHECK-NEXT: b.eq LBB0_10 ; CHECK-NEXT: ; %bb.4: ; %entry ; CHECK-NEXT: cmp w8, #2 ; CHECK-NEXT: b.eq LBB0_6 @@ -46,6 +46,10 @@ define i32 @scanfile(i32 %call148) { ; CHECK-NEXT: LBB0_9: ; %sw.bb150 ; CHECK-NEXT: bl _logg ; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: LBB0_10: ; %sw.bb178 +; CHECK-NEXT: str wzr, [x1] +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret entry: switch i32 %call148, label %common.ret [ i32 -1, label %sw.bb @@ -80,7 +84,7 @@ sw.bb152: ; preds = %entry br label %common.ret sw.bb178: ; preds = %entry - call void @llvm.lifetime.start.p0(i64 0, ptr null) + store i32 0, ptr %p br label %common.ret } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir new file mode 100644 index 0000000..8552931 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir @@ -0,0 +1,109 @@ +# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple aarch64 -passes="print<gisel-value-tracking>" %s -o - 2>&1 | FileCheck %s + +--- +name: Cst +body: | + bb.1: + ; CHECK-LABEL: name: @Cst + ; CHECK-NEXT: %0:_ KnownBits:10000000 SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6 + ; CHECK-NEXT: %2:_ KnownBits:11110000 SignBits:4 + %0:_(s8) = G_CONSTANT i8 128 + %1:_(s8) = G_CONSTANT i8 3 + %2:_(s8) = G_ASHR %0, %1 +... +--- +name: CstBig +body: | + bb.1: + ; CHECK-LABEL: name: @CstBig + ; CHECK-NEXT: %0:_ KnownBits:11111000 SignBits:5 + ; CHECK-NEXT: %1:_ KnownBits:00000110 SignBits:5 + ; CHECK-NEXT: %2:_ KnownBits:11111111 SignBits:8 + %0:_(s8) = G_CONSTANT i8 248 + %1:_(s8) = G_CONSTANT i8 6 + %2:_(s8) = G_ASHR %0, %1 +... +--- +name: ScalarVar +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarVar + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = COPY $b1 + %2:_(s8) = G_ASHR %0, %1 +... +--- +name: ScalarCst +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarCst + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:4 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 3 + %2:_(s8) = G_ASHR %0, %1 +... +--- +name: VectorVar +body: | + bb.1: + ; CHECK-LABEL: name: @VectorVar + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????? SignBits:1 + %0:_(<4 x s16>) = COPY $d0 + %1:_(<4 x s16>) = COPY $d1 + %2:_(<4 x s16>) = G_ASHR %0, %1 +... +--- +name: VectorCst +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCst + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:4 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = G_CONSTANT i16 3 + %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %3:_(<4 x s16>) = G_ASHR %0, %2 +... +--- +name: VectorCst36 +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCst36 + ; CHECK-NEXT: %0:_ KnownBits:???????????????? 
SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000110 SignBits:13 + ; CHECK-NEXT: %3:_ KnownBits:0000000000000?1? SignBits:13 + ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:4 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = G_CONSTANT i16 3 + %2:_(s16) = G_CONSTANT i16 6 + %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1 + %4:_(<4 x s16>) = G_ASHR %0, %3 +... +--- +name: VectorCst3unknown +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCst3unknown + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1 + %0:_(<4 x s16>) = COPY $d0 + %2:_(s16) = COPY $h0 + %1:_(s16) = G_CONSTANT i16 3 + %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1 + %4:_(<4 x s16>) = G_ASHR %0, %3 +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index bd2d8c09..5c164bf 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -71,12 +71,13 @@ # DEBUG-NEXT: .. the first uncovered imm index: 0, OK # # DEBUG-NEXT: G_ABDS (opcode 65): 1 type index, 0 imm indices -# DEBUG-NEXT:.. type index coverage check SKIPPED: no rules defined -# DEBUG-NEXT:.. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # -# DEBUG-NEXT:G_ABDU (opcode 66): 1 type index, 0 imm indices -# DEBUG-NEXT:.. type index coverage check SKIPPED: no rules defined -# DEBUG-NEXT:.. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ABDU (opcode 66): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # # DEBUG-NEXT: G_IMPLICIT_DEF (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. 
the first uncovered type index: {{[0-9]+}}, OK diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll index be79135..747db39 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll @@ -14,10 +14,10 @@ define <8 x i16> @dupsext_v8i8_v8i16(i8 %src, <8 x i8> %b) { ; CHECK-GI-LABEL: dupsext_v8i8_v8i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: lsl w8, w0, #8 -; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: sbfx w8, w8, #8, #8 ; CHECK-GI-NEXT: dup v1.8h, w8 -; CHECK-GI-NEXT: mul v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: xtn v1.8b, v1.8h +; CHECK-GI-NEXT: smull v0.8h, v1.8b, v0.8b ; CHECK-GI-NEXT: ret entry: %in = sext i8 %src to i16 diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index 2f23a32..6e5c666 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -2264,33 +2264,12 @@ define <2 x i64> @lsr_const(<2 x i64> %a, <2 x i64> %b) { } define <2 x i64> @asr(<2 x i64> %a, <2 x i64> %b) { -; CHECK-NEON-LABEL: asr: -; CHECK-NEON: // %bb.0: -; CHECK-NEON-NEXT: shrn v0.2s, v0.2d, #32 -; CHECK-NEON-NEXT: shrn v1.2s, v1.2d, #32 -; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s -; CHECK-NEON-NEXT: ret -; -; CHECK-SVE-LABEL: asr: -; CHECK-SVE: // %bb.0: -; CHECK-SVE-NEXT: shrn v0.2s, v0.2d, #32 -; CHECK-SVE-NEXT: shrn v1.2s, v1.2d, #32 -; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s -; CHECK-SVE-NEXT: ret -; -; CHECK-GI-LABEL: asr: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #32 -; CHECK-GI-NEXT: sshr v1.2d, v1.2d, #32 -; CHECK-GI-NEXT: fmov x10, d0 -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mov x9, v1.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: asr: +; CHECK: // %bb.0: +; CHECK-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s +; CHECK-NEXT: ret %x = ashr <2 x i64> %a, <i64 32, i64 32> %y = ashr <2 x i64> %b, <i64 32, i64 32> %z = mul nsw <2 x i64> %x, %y @@ -2298,34 +2277,12 @@ define <2 x i64> @asr(<2 x i64> %a, <2 x i64> %b) { } define <2 x i64> @asr_const(<2 x i64> %a, <2 x i64> %b) { -; CHECK-NEON-LABEL: asr_const: -; CHECK-NEON: // %bb.0: -; CHECK-NEON-NEXT: movi v1.2s, #31 -; CHECK-NEON-NEXT: shrn v0.2s, v0.2d, #32 -; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s -; CHECK-NEON-NEXT: ret -; -; CHECK-SVE-LABEL: asr_const: -; CHECK-SVE: // %bb.0: -; CHECK-SVE-NEXT: movi v1.2s, #31 -; CHECK-SVE-NEXT: shrn v0.2s, v0.2d, #32 -; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s -; CHECK-SVE-NEXT: ret -; -; CHECK-GI-LABEL: asr_const: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI81_0 -; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #32 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI81_0] -; CHECK-GI-NEXT: fmov x10, d0 -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mov x9, v1.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: asr_const: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2s, #31 +; CHECK-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s +; CHECK-NEXT: ret %x = ashr <2 x i64> %a, <i64 32, i64 32> %z = mul nsw <2 x i64> %x, <i64 31, i64 31> ret <2 x i64> %z diff --git 
a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll index bd28d13..256ff94 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll @@ -1,5 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for test_vmull_p8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_high_p8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_p64 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_high_p64 declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #5 @@ -101,11 +107,18 @@ entry: } define <8 x i16> @test_vaddl_a8(<8 x i8> %a, <8 x i8> %b) { -; CHECK-LABEL: test_vaddl_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddl_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddl_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %vmovl.i.i = zext <8 x i8> %a to <8 x i16> %vmovl.i2.i = zext <8 x i8> %b to <8 x i16> @@ -229,11 +242,18 @@ entry: } define <8 x i16> @test_vaddl_high_a8(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vaddl_high_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl2 v0.8h, v0.16b, v1.16b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddl_high_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddl2 v0.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddl_high_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: uaddl2 v0.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> @@ -345,11 +365,18 @@ entry: } define <8 x i16> @test_vaddw_a8(<8 x i16> %a, <8 x i8> %b) { -; CHECK-LABEL: test_vaddw_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddw v0.8h, v0.8h, v1.8b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddw_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddw v0.8h, v0.8h, v1.8b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddw_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: uaddw v0.8h, v0.8h, v1.8b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %vmovl.i.i = zext <8 x i8> %b to <8 x i16> %add.i 
= add <8 x i16> %vmovl.i.i, %a @@ -458,11 +485,18 @@ entry: } define <8 x i16> @test_vaddw_high_a8(<8 x i16> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vaddw_high_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddw2 v0.8h, v0.8h, v1.16b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddw_high_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddw2 v0.8h, v0.8h, v1.16b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddw_high_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: uaddw2 v0.8h, v0.8h, v1.16b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> @@ -574,11 +608,18 @@ entry: } define <8 x i16> @test_vsubl_a8(<8 x i8> %a, <8 x i8> %b) { -; CHECK-LABEL: test_vsubl_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubl_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubl_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %vmovl.i.i = zext <8 x i8> %a to <8 x i16> %vmovl.i2.i = zext <8 x i8> %b to <8 x i16> @@ -702,11 +743,18 @@ entry: } define <8 x i16> @test_vsubl_high_a8(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsubl_high_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: usubl2 v0.8h, v0.16b, v1.16b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubl_high_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: usubl2 v0.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubl_high_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: usubl2 v0.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> @@ -818,11 +866,18 @@ entry: } define <8 x i16> @test_vsubw_a8(<8 x i16> %a, <8 x i8> %b) { -; CHECK-LABEL: test_vsubw_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: usubw v0.8h, v0.8h, v1.8b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubw_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: usubw v0.8h, v0.8h, v1.8b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubw_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: usubw v0.8h, v0.8h, v1.8b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %vmovl.i.i = zext <8 x i8> %b to <8 x i16> %sub.i = sub <8 x i16> %a, %vmovl.i.i @@ -931,11 +986,18 @@ entry: } define <8 x i16> @test_vsubw_high_a8(<8 x i16> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsubw_high_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: usubw2 v0.8h, v0.8h, v1.16b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: 
test_vsubw_high_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: usubw2 v0.8h, v0.8h, v1.16b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubw_high_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: usubw2 v0.8h, v0.8h, v1.16b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> @@ -975,10 +1037,16 @@ entry: } define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vaddhn_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addhn v0.8b, v0.8h, v1.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addhn v0.8b, v0.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i = add <8 x i16> %a, %b %vaddhn1.i = lshr <8 x i16> %vaddhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -987,10 +1055,16 @@ entry: } define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vaddhn_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-GI-NEXT: ret entry: %vaddhn.i = add <4 x i32> %a, %b %vaddhn1.i = lshr <4 x i32> %vaddhn.i, <i32 16, i32 16, i32 16, i32 16> @@ -999,10 +1073,16 @@ entry: } define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vaddhn_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addhn v0.2s, v0.2d, v1.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addhn v0.2s, v0.2d, v1.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-GI-NEXT: ret entry: %vaddhn.i = add <2 x i64> %a, %b %vaddhn1.i = lshr <2 x i64> %vaddhn.i, <i64 32, i64 32> @@ -1011,10 +1091,16 @@ entry: } define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vaddhn_u16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addhn v0.8b, v0.8h, v1.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_u16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addhn v0.8b, v0.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_u16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i = add <8 x i16> %a, %b %vaddhn1.i = lshr <8 x i16> %vaddhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1023,10 +1109,16 @@ entry: } define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vaddhn_u32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_u32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; 
CHECK-GI-LABEL: test_vaddhn_u32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-GI-NEXT: ret entry: %vaddhn.i = add <4 x i32> %a, %b %vaddhn1.i = lshr <4 x i32> %vaddhn.i, <i32 16, i32 16, i32 16, i32 16> @@ -1035,10 +1127,16 @@ entry: } define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vaddhn_u64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addhn v0.2s, v0.2d, v1.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_u64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addhn v0.2s, v0.2d, v1.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_u64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-GI-NEXT: ret entry: %vaddhn.i = add <2 x i64> %a, %b %vaddhn1.i = lshr <2 x i64> %vaddhn.i, <i64 32, i64 32> @@ -1047,11 +1145,20 @@ entry: } define <16 x i8> @test_vaddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vaddhn_high_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: addhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_high_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_high_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i.i = add <8 x i16> %a, %b %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1064,11 +1171,20 @@ entry: } define <8 x i16> @test_vaddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vaddhn_high_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: addhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_high_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_high_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i.i = add <4 x i32> %a, %b %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, <i32 16, i32 16, i32 16, i32 16> @@ -1081,11 +1197,20 @@ entry: } define <4 x i32> @test_vaddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vaddhn_high_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: addhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_high_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_high_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v1.2d, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: 
%vaddhn.i.i = add <2 x i64> %a, %b %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, <i64 32, i64 32> @@ -1098,11 +1223,20 @@ entry: } define <16 x i8> @test_vaddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vaddhn_high_u16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: addhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_high_u16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_high_u16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i.i = add <8 x i16> %a, %b %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1115,11 +1249,20 @@ entry: } define <8 x i16> @test_vaddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vaddhn_high_u32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: addhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_high_u32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_high_u32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i.i = add <4 x i32> %a, %b %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, <i32 16, i32 16, i32 16, i32 16> @@ -1132,11 +1275,20 @@ entry: } define <4 x i32> @test_vaddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vaddhn_high_u64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: addhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_high_u64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_high_u64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v1.2d, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i.i = add <2 x i64> %a, %b %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, <i64 32, i64 32> @@ -1209,11 +1361,19 @@ entry: } define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vraddhn_high_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: raddhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vraddhn_high_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: raddhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vraddhn_high_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: raddhn v1.8b, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: 
mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vraddhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) %0 = bitcast <8 x i8> %r to <1 x i64> @@ -1224,11 +1384,19 @@ entry: } define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vraddhn_high_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: raddhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vraddhn_high_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: raddhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vraddhn_high_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: raddhn v1.4h, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vraddhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) %0 = bitcast <4 x i16> %r to <1 x i64> @@ -1239,11 +1407,19 @@ entry: } define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vraddhn_high_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: raddhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vraddhn_high_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: raddhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vraddhn_high_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: raddhn v1.2s, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vraddhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) %0 = bitcast <2 x i32> %r to <1 x i64> @@ -1254,11 +1430,19 @@ entry: } define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vraddhn_high_u16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: raddhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vraddhn_high_u16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: raddhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vraddhn_high_u16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: raddhn v1.8b, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vraddhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) %0 = bitcast <8 x i8> %r to <1 x i64> @@ -1269,11 +1453,19 @@ entry: } define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vraddhn_high_u32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: raddhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vraddhn_high_u32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: raddhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vraddhn_high_u32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: raddhn v1.4h, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; 
CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vraddhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) %0 = bitcast <4 x i16> %r to <1 x i64> @@ -1284,11 +1476,19 @@ entry: } define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vraddhn_high_u64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: raddhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vraddhn_high_u64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: raddhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vraddhn_high_u64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: raddhn v1.2s, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vraddhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) %0 = bitcast <2 x i32> %r to <1 x i64> @@ -1299,10 +1499,16 @@ entry: } define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vsubhn_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subhn v0.8b, v0.8h, v1.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: subhn v0.8b, v0.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i = sub <8 x i16> %a, %b %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1311,10 +1517,16 @@ entry: } define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vsubhn_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subhn v0.4h, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: subhn v0.4h, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-GI-NEXT: ret entry: %vsubhn.i = sub <4 x i32> %a, %b %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16> @@ -1323,10 +1535,16 @@ entry: } define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vsubhn_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subhn v0.2s, v0.2d, v1.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: subhn v0.2s, v0.2d, v1.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-GI-NEXT: ret entry: %vsubhn.i = sub <2 x i64> %a, %b %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32> @@ -1335,10 +1553,16 @@ entry: } define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vsubhn_u16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subhn v0.8b, v0.8h, v1.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_u16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: subhn v0.8b, v0.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_u16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: shrn 
v0.8b, v0.8h, #8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i = sub <8 x i16> %a, %b %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1347,10 +1571,16 @@ entry: } define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vsubhn_u32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subhn v0.4h, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_u32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: subhn v0.4h, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_u32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-GI-NEXT: ret entry: %vsubhn.i = sub <4 x i32> %a, %b %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16> @@ -1359,10 +1589,16 @@ entry: } define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vsubhn_u64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subhn v0.2s, v0.2d, v1.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_u64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: subhn v0.2s, v0.2d, v1.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_u64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-GI-NEXT: ret entry: %vsubhn.i = sub <2 x i64> %a, %b %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32> @@ -1371,11 +1607,20 @@ entry: } define <16 x i8> @test_vsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vsubhn_high_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: subhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_high_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_high_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i.i = sub <8 x i16> %a, %b %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1388,11 +1633,20 @@ entry: } define <8 x i16> @test_vsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vsubhn_high_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: subhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_high_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_high_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i.i = sub <4 x i32> %a, %b %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, <i32 16, i32 16, i32 16, i32 16> @@ -1405,11 +1659,20 @@ entry: } define <4 x i32> @test_vsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vsubhn_high_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: subhn2 v0.4s, 
v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_high_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_high_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v1.2d, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i.i = sub <2 x i64> %a, %b %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, <i64 32, i64 32> @@ -1422,11 +1685,20 @@ entry: } define <16 x i8> @test_vsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vsubhn_high_u16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: subhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_high_u16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_high_u16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i.i = sub <8 x i16> %a, %b %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1439,11 +1711,20 @@ entry: } define <8 x i16> @test_vsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vsubhn_high_u32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: subhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_high_u32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_high_u32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i.i = sub <4 x i32> %a, %b %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, <i32 16, i32 16, i32 16, i32 16> @@ -1456,11 +1737,20 @@ entry: } define <4 x i32> @test_vsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vsubhn_high_u64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: subhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_high_u64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_high_u64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v1.2d, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i.i = sub <2 x i64> %a, %b %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, <i64 32, i64 32> @@ -1533,11 +1823,19 @@ entry: } define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vrsubhn_high_s16: -; CHECK: // %bb.0: // %entry -; 
CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: rsubhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vrsubhn_high_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: rsubhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vrsubhn_high_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: rsubhn v1.8b, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vrsubhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) %0 = bitcast <8 x i8> %r to <1 x i64> @@ -1548,11 +1846,19 @@ entry: } define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vrsubhn_high_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: rsubhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vrsubhn_high_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: rsubhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vrsubhn_high_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: rsubhn v1.4h, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vrsubhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) %0 = bitcast <4 x i16> %r to <1 x i64> @@ -1563,11 +1869,19 @@ entry: } define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vrsubhn_high_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: rsubhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vrsubhn_high_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: rsubhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vrsubhn_high_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: rsubhn v1.2s, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vrsubhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) %0 = bitcast <2 x i32> %r to <1 x i64> @@ -1578,11 +1892,19 @@ entry: } define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vrsubhn_high_u16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: rsubhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vrsubhn_high_u16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: rsubhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vrsubhn_high_u16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: rsubhn v1.8b, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vrsubhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) %0 = bitcast <8 x i8> %r to <1 x i64> @@ -1593,11 +1915,19 @@ entry: } define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: 
test_vrsubhn_high_u32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: rsubhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vrsubhn_high_u32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: rsubhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vrsubhn_high_u32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: rsubhn v1.4h, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vrsubhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) %0 = bitcast <4 x i16> %r to <1 x i64> @@ -1608,11 +1938,19 @@ entry: } define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vrsubhn_high_u64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: rsubhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vrsubhn_high_u64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: rsubhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vrsubhn_high_u64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: rsubhn v1.2s, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vrsubhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) %0 = bitcast <2 x i32> %r to <1 x i64> @@ -2535,21 +2873,40 @@ entry: } define <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %scale.coerce) { -; CHECK-LABEL: cmplx_mul_combined_re_im: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: lsr x8, x0, #16 -; CHECK-NEXT: movi v1.2d, #0xffff0000ffff0000 -; CHECK-NEXT: rev32 v4.8h, v0.8h -; CHECK-NEXT: dup v2.8h, w8 -; CHECK-NEXT: sqneg v3.8h, v2.8h -; CHECK-NEXT: bsl v1.16b, v2.16b, v3.16b -; CHECK-NEXT: fmov d3, x0 -; CHECK-NEXT: sqdmull v2.4s, v4.4h, v1.4h -; CHECK-NEXT: sqdmull2 v1.4s, v4.8h, v1.8h -; CHECK-NEXT: sqdmlal v2.4s, v0.4h, v3.h[0] -; CHECK-NEXT: sqdmlal2 v1.4s, v0.8h, v3.h[0] -; CHECK-NEXT: uzp2 v0.8h, v2.8h, v1.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: cmplx_mul_combined_re_im: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: lsr x8, x0, #16 +; CHECK-SD-NEXT: movi v1.2d, #0xffff0000ffff0000 +; CHECK-SD-NEXT: rev32 v4.8h, v0.8h +; CHECK-SD-NEXT: dup v2.8h, w8 +; CHECK-SD-NEXT: sqneg v3.8h, v2.8h +; CHECK-SD-NEXT: bsl v1.16b, v2.16b, v3.16b +; CHECK-SD-NEXT: fmov d3, x0 +; CHECK-SD-NEXT: sqdmull v2.4s, v4.4h, v1.4h +; CHECK-SD-NEXT: sqdmull2 v1.4s, v4.8h, v1.8h +; CHECK-SD-NEXT: sqdmlal v2.4s, v0.4h, v3.h[0] +; CHECK-SD-NEXT: sqdmlal2 v1.4s, v0.8h, v3.h[0] +; CHECK-SD-NEXT: uzp2 v0.8h, v2.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: cmplx_mul_combined_re_im: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: lsr x9, x0, #16 +; CHECK-GI-NEXT: adrp x8, .LCPI196_0 +; CHECK-GI-NEXT: rev32 v4.8h, v0.8h +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI196_0] +; CHECK-GI-NEXT: fmov d1, x9 +; CHECK-GI-NEXT: dup v2.8h, v1.h[0] +; CHECK-GI-NEXT: sqneg v1.8h, v2.8h +; CHECK-GI-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: fmov d3, x0 +; CHECK-GI-NEXT: sqdmull v2.4s, v2.4h, v3.h[0] +; CHECK-GI-NEXT: sqdmull v5.4s, v4.4h, v1.4h +; CHECK-GI-NEXT: sqdmlal v5.4s, v0.4h, v3.h[0] +; 
CHECK-GI-NEXT: sqdmlal2 v2.4s, v4.8h, v1.8h +; CHECK-GI-NEXT: uzp2 v0.8h, v5.8h, v2.8h +; CHECK-GI-NEXT: ret entry: %scale.sroa.2.0.extract.shift23 = lshr i64 %scale.coerce, 16 %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll b/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll index cc9732b..6c7ddd9 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s +; RUN: llc -mtriple=arm64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=arm64-none-linux-gnu -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) @@ -197,11 +198,20 @@ define <2 x i32> @test_sabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { } define <2 x i32> @test_sabd_v2i32_const() { -; CHECK-LABEL: test_sabd_v2i32_const: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI19_0 -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI19_0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_sabd_v2i32_const: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI19_0 +; CHECK-SD-NEXT: ldr d0, [x8, :lo12:.LCPI19_0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_sabd_v2i32_const: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI19_1 +; CHECK-GI-NEXT: adrp x9, .LCPI19_0 +; CHECK-GI-NEXT: ldr d0, [x8, :lo12:.LCPI19_1] +; CHECK-GI-NEXT: ldr d1, [x9, :lo12:.LCPI19_0] +; CHECK-GI-NEXT: sabd v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: ret %1 = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32( <2 x i32> <i32 -2147483648, i32 2147450880>, <2 x i32> <i32 -65536, i32 65535>) @@ -293,15 +303,26 @@ define <2 x double> @test_fabd_v2f64(<2 x double> %lhs, <2 x double> %rhs) { } define <8 x i16> @test_uabd_knownbits_vec8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK-LABEL: test_uabd_knownbits_vec8i16: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.8h, #15 -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: uabd v0.8h, v0.8h, v1.8h -; CHECK-NEXT: rev64 v0.8h, v0.8h -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_uabd_knownbits_vec8i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v2.8h, #15 +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: uabd v0.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: rev64 v0.8h, v0.8h +; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_uabd_knownbits_vec8i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.8h, #15 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: uabd v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: rev64 v0.8h, v0.8h +; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret %and1 = and <8 x i16> %lhs, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> %and2 = and <8 x i16> %rhs, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> %uabd = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %and1, <8 x i16> %and2) @@ -311,11 +332,22 @@ define <8 x i16> @test_uabd_knownbits_vec8i16(<8 x i16> %lhs, 
<8 x i16> %rhs) { } define <4 x i32> @knownbits_uabd_mask_and_shuffle_lshr(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: knownbits_uabd_mask_and_shuffle_lshr: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ushr v0.4s, v0.4s, #17 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: knownbits_uabd_mask_and_shuffle_lshr: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: ushr v0.4s, v0.4s, #17 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: knownbits_uabd_mask_and_shuffle_lshr: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: uabd v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: rev64 v0.4s, v0.4s +; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #17 +; CHECK-GI-NEXT: ret %1 = and <4 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535> %2 = and <4 x i32> %a1, <i32 65535, i32 65535, i32 65535, i32 65535> %3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %1, <4 x i32> %2) @@ -325,10 +357,19 @@ define <4 x i32> @knownbits_uabd_mask_and_shuffle_lshr(<4 x i32> %a0, <4 x i32> } define <4 x i32> @knownbits_mask_and_shuffle_lshr(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: knownbits_mask_and_shuffle_lshr: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: knownbits_mask_and_shuffle_lshr: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: knownbits_mask_and_shuffle_lshr: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.4s, #127, msl #8 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: uabd v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #17 +; CHECK-GI-NEXT: ret %1 = and <4 x i32> %a0, <i32 32767, i32 32767, i32 32767, i32 32767> %2 = and <4 x i32> %a1, <i32 32767, i32 32767, i32 32767, i32 32767> %3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %1, <4 x i32> %2) @@ -338,20 +379,36 @@ define <4 x i32> @knownbits_mask_and_shuffle_lshr(<4 x i32> %a0, <4 x i32> %a1) } define <4 x i32> @test_sabd_knownbits_vec4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK-LABEL: test_sabd_knownbits_vec4i32: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI31_0 -; CHECK-NEXT: adrp x9, .LCPI31_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI31_0] -; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI31_1] -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-NEXT: sabd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: movi v1.2d, #0x0000ff000000ff -; CHECK-NEXT: mov v0.s[1], v0.s[0] -; CHECK-NEXT: trn2 v0.4s, v0.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_sabd_knownbits_vec4i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI31_0 +; CHECK-SD-NEXT: adrp x9, .LCPI31_1 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI31_0] +; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI31_1] +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: sabd v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: movi v1.2d, #0x0000ff000000ff +; CHECK-SD-NEXT: mov v0.s[1], v0.s[0] +; CHECK-SD-NEXT: trn2 v0.4s, v0.4s, v0.4s +; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_sabd_knownbits_vec4i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI31_2 +; CHECK-GI-NEXT: adrp x9, .LCPI31_1 +; CHECK-GI-NEXT: ldr 
q2, [x8, :lo12:.LCPI31_2] +; CHECK-GI-NEXT: ldr q3, [x9, :lo12:.LCPI31_1] +; CHECK-GI-NEXT: adrp x8, .LCPI31_0 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI31_0] +; CHECK-GI-NEXT: movi v3.2d, #0x0000ff000000ff +; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-GI-NEXT: ret %and1 = and <4 x i32> %lhs, <i32 255, i32 -1, i32 -1, i32 255> %and2 = and <4 x i32> %rhs, <i32 255, i32 255, i32 -1, i32 -1> %abd = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %and1, <4 x i32> %and2) @@ -361,15 +418,27 @@ define <4 x i32> @test_sabd_knownbits_vec4i32(<4 x i32> %lhs, <4 x i32> %rhs) { } define <4 x i32> @knownbits_sabd_and_mask(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: knownbits_sabd_and_mask: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI32_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI32_0] -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: sabd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: zip2 v0.4s, v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: knownbits_sabd_and_mask: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI32_0 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI32_0] +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: sabd v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: zip2 v0.4s, v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: knownbits_sabd_and_mask: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI32_1 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI32_1] +; CHECK-GI-NEXT: adrp x8, .LCPI32_0 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI32_0] +; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: ret %1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085> %2 = and <4 x i32> %a1, <i32 -1, i32 -1, i32 255, i32 4085> %3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %1, <4 x i32> %2) @@ -378,10 +447,25 @@ define <4 x i32> @knownbits_sabd_and_mask(<4 x i32> %a0, <4 x i32> %a1) { } define <4 x i32> @knownbits_sabd_and_or_mask(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: knownbits_sabd_and_or_mask: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: knownbits_sabd_and_or_mask: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: knownbits_sabd_and_or_mask: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI33_1 +; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_1] +; CHECK-GI-NEXT: adrp x8, .LCPI33_0 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_0] +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-GI-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: uabd v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: ret %1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085> %2 = or <4 x i32> %1, <i32 65535, i32 65535, i32 65535, i32 65535> %3 = and <4 x i32> %a1, <i32 -1, i32 -1, i32 255, i32 4085> @@ -392,18 +476,33 @@ define <4 x i32> @knownbits_sabd_and_or_mask(<4 x i32> %a0, <4 x i32> %a1) { } define <4 x i32> @knownbits_sabd_and_xor_mask(<4 x 
i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: knownbits_sabd_and_xor_mask: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI34_0 -; CHECK-NEXT: movi v3.2d, #0x00ffff0000ffff -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI34_0] -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b -; CHECK-NEXT: eor v1.16b, v1.16b, v3.16b -; CHECK-NEXT: sabd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: zip2 v0.4s, v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: knownbits_sabd_and_xor_mask: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI34_0 +; CHECK-SD-NEXT: movi v3.2d, #0x00ffff0000ffff +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI34_0] +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: eor v0.16b, v0.16b, v3.16b +; CHECK-SD-NEXT: eor v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: sabd v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: zip2 v0.4s, v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: knownbits_sabd_and_xor_mask: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI34_1 +; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI34_1] +; CHECK-GI-NEXT: adrp x8, .LCPI34_0 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI34_0] +; CHECK-GI-NEXT: eor v0.16b, v0.16b, v3.16b +; CHECK-GI-NEXT: eor v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: ret %1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085> %2 = xor <4 x i32> %1, <i32 65535, i32 65535, i32 65535, i32 65535> %3 = and <4 x i32> %a1, <i32 -1, i32 -1, i32 255, i32 4085> @@ -414,10 +513,24 @@ define <4 x i32> @knownbits_sabd_and_xor_mask(<4 x i32> %a0, <4 x i32> %a1) { } define <4 x i32> @knownbits_sabd_and_shl_mask(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: knownbits_sabd_and_shl_mask: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: knownbits_sabd_and_shl_mask: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: knownbits_sabd_and_shl_mask: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI35_1 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI35_1] +; CHECK-GI-NEXT: adrp x8, .LCPI35_0 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI35_0] +; CHECK-GI-NEXT: shl v0.4s, v0.4s, #17 +; CHECK-GI-NEXT: shl v1.4s, v1.4s, #17 +; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: ret %1 = and <4 x i32> %a0, <i32 -65536, i32 -7, i32 -7, i32 -65536> %2 = shl <4 x i32> %1, <i32 17, i32 17, i32 17, i32 17> %3 = and <4 x i32> %a1, <i32 -65536, i32 -7, i32 -7, i32 -65536> @@ -428,18 +541,32 @@ define <4 x i32> @knownbits_sabd_and_shl_mask(<4 x i32> %a0, <4 x i32> %a1) { } define <4 x i32> @knownbits_sabd_and_mul_mask(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: knownbits_sabd_and_mul_mask: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI36_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI36_0] -; CHECK-NEXT: and v3.16b, v0.16b, v2.16b -; CHECK-NEXT: and v2.16b, v1.16b, v2.16b -; CHECK-NEXT: mul v0.4s, v0.4s, v3.4s -; CHECK-NEXT: mul v1.4s, v1.4s, v2.4s -; CHECK-NEXT: sabd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: mov v0.s[1], v0.s[0] -; CHECK-NEXT: trn2 v0.4s, v0.4s, v0.4s -; CHECK-NEXT: ret +; 
CHECK-SD-LABEL: knownbits_sabd_and_mul_mask: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI36_0 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI36_0] +; CHECK-SD-NEXT: and v3.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: and v2.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: mul v0.4s, v0.4s, v3.4s +; CHECK-SD-NEXT: mul v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: sabd v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: mov v0.s[1], v0.s[0] +; CHECK-SD-NEXT: trn2 v0.4s, v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: knownbits_sabd_and_mul_mask: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI36_1 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI36_1] +; CHECK-GI-NEXT: adrp x8, .LCPI36_0 +; CHECK-GI-NEXT: and v3.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v2.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: mul v0.4s, v0.4s, v3.4s +; CHECK-GI-NEXT: mul v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI36_0] +; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: ret %1 = and <4 x i32> %a0, <i32 -65536, i32 -7, i32 -7, i32 -65536> %2 = mul <4 x i32> %a0, %1 %3 = and <4 x i32> %a1, <i32 -65536, i32 -7, i32 -7, i32 -65536> diff --git a/llvm/test/CodeGen/AArch64/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/combine-sdiv.ll index 2b7fa08..e1ba0e9 100644 --- a/llvm/test/CodeGen/AArch64/combine-sdiv.ll +++ b/llvm/test/CodeGen/AArch64/combine-sdiv.ll @@ -1631,7 +1631,6 @@ define i8 @combine_i8_sdiv_const100(i8 %x) { ; CHECK-GI-NEXT: sxtb w8, w0 ; CHECK-GI-NEXT: mov w9, #41 // =0x29 ; CHECK-GI-NEXT: mul w8, w8, w9 -; CHECK-GI-NEXT: sxth w8, w8 ; CHECK-GI-NEXT: sbfx w8, w8, #8, #8 ; CHECK-GI-NEXT: asr w8, w8, #4 ; CHECK-GI-NEXT: ubfx w9, w8, #7, #1 diff --git a/llvm/test/CodeGen/AArch64/highextractbitcast.ll b/llvm/test/CodeGen/AArch64/highextractbitcast.ll index f82d1ed..df4889b 100644 --- a/llvm/test/CodeGen/AArch64/highextractbitcast.ll +++ b/llvm/test/CodeGen/AArch64/highextractbitcast.ll @@ -1,6 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-LE +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes CHECK,CHECK-LE ; RUN: llc -mtriple=aarch64_be-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-BE +; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for test_pmull_high_p8_128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_pmull_high_p8_64 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) @@ -12,10 +16,10 @@ declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %s1, <2 x i32> %s2) define <4 x i32> @test_smull_high_s16_base(<8 x i16> %a, <8 x i16> %b) #0 { -; CHECK-LE-LABEL: test_smull_high_s16_base: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_smull_high_s16_base: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_smull_high_s16_base: ; CHECK-BE: // %bb.0: // %entry @@ -35,10 +39,10 @@ entry: } define <4 x i32> @test_smull_high_s16_bitcasta1(<2 x i64> %aa, <8 x i16> %b) #0 { -; CHECK-LE-LABEL: 
test_smull_high_s16_bitcasta1: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_smull_high_s16_bitcasta1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_smull_high_s16_bitcasta1: ; CHECK-BE: // %bb.0: // %entry @@ -59,10 +63,10 @@ entry: } define <4 x i32> @test_smull_high_s16_bitcastb1(<8 x i16> %a, <16 x i8> %bb) #0 { -; CHECK-LE-LABEL: test_smull_high_s16_bitcastb1: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_smull_high_s16_bitcastb1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_smull_high_s16_bitcastb1: ; CHECK-BE: // %bb.0: // %entry @@ -83,10 +87,10 @@ entry: } define <4 x i32> @test_smull_high_s16_bitcasta2(<2 x i64> %a, <8 x i16> %b) #0 { -; CHECK-LE-LABEL: test_smull_high_s16_bitcasta2: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_smull_high_s16_bitcasta2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_smull_high_s16_bitcasta2: ; CHECK-BE: // %bb.0: // %entry @@ -109,10 +113,10 @@ entry: } define <4 x i32> @test_smull_high_s16_bitcastb2(<8 x i16> %a, <16 x i8> %b) #0 { -; CHECK-LE-LABEL: test_smull_high_s16_bitcastb2: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_smull_high_s16_bitcastb2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_smull_high_s16_bitcastb2: ; CHECK-BE: // %bb.0: // %entry @@ -157,6 +161,13 @@ define <4 x i32> @test_smull_high_s16_bitcasta1_wrongindex(<2 x i64> %aa, <8 x i ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_smull_high_s16_bitcasta1_wrongindex: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #4 +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: ret entry: %a = bitcast <2 x i64> %aa to <8 x i16> %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -186,6 +197,13 @@ define <4 x i32> @test_smull_high_s16_bitcastb1_wrongindex(<8 x i16> %a, <16 x i ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_smull_high_s16_bitcastb1_wrongindex: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: ext v1.16b, v1.16b, v0.16b, #6 +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: ret entry: %b = bitcast <16 x i8> %bb to <8 x i16> %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> @@ -215,6 +233,13 @@ define <4 x i32> @test_smull_high_s16_bitcasta2_wrongindex(<4 x i32> %a, <8 x i1 ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_smull_high_s16_bitcasta2_wrongindex: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #4 +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: ret entry: %s1a = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 1, i32 2> %s1 = bitcast <2 x i32> %s1a to <4 x 
i16> @@ -244,6 +269,13 @@ define <4 x i32> @test_smull_high_s16_bitcastb2_wrongindex(<8 x i16> %a, <16 x i ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_smull_high_s16_bitcastb2_wrongindex: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: ext v1.16b, v1.16b, v0.16b, #4 +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: ret entry: %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %s2a = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -269,6 +301,12 @@ define <4 x i32> @test_smull_high_s16_splata1(<2 x i64> %aa, <8 x i16> %b) #0 { ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_smull_high_s16_splata1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: smull v0.4s, v1.4h, v0.h[3] +; CHECK-GI-NEXT: ret entry: %a = bitcast <2 x i64> %aa to <8 x i16> %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> @@ -293,6 +331,12 @@ define <4 x i32> @test_smull_high_s16_splatb1(<8 x i16> %a, <16 x i8> %bb) #0 { ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_smull_high_s16_splatb1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.h[3] +; CHECK-GI-NEXT: ret entry: %b = bitcast <16 x i8> %bb to <8 x i16> %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> @@ -322,6 +366,13 @@ define <4 x i32> @test_smull_high_s16_splata2(<4 x i32> %a, <8 x i16> %b) #0 { ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_smull_high_s16_splata2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: dup v0.2s, v0.s[3] +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: ret entry: %s1a = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 3, i32 3> %s1 = bitcast <2 x i32> %s1a to <4 x i16> @@ -351,6 +402,13 @@ define <4 x i32> @test_smull_high_s16_splatb2(<8 x i16> %a, <16 x i8> %b) #0 { ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_smull_high_s16_splatb2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: dup v1.8b, v1.b[3] +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: ret entry: %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %s2a = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> @@ -362,10 +420,10 @@ entry: define <4 x i32> @test_umull_high_s16_bitcasta1(<2 x i64> %aa, <8 x i16> %b) #0 { -; CHECK-LE-LABEL: test_umull_high_s16_bitcasta1: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: umull2 v0.4s, v0.8h, v1.8h -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_umull_high_s16_bitcasta1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_umull_high_s16_bitcasta1: ; CHECK-BE: // %bb.0: // %entry @@ -386,10 +444,10 @@ entry: } define <8 x i16> @test_vabdl_high_u82(<16 x i8> %a, <8 x i16> %bb) { -; CHECK-LE-LABEL: test_vabdl_high_u82: -; 
CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: uabdl2 v0.8h, v0.16b, v1.16b -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_vabdl_high_u82: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uabdl2 v0.8h, v0.16b, v1.16b +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_vabdl_high_u82: ; CHECK-BE: // %bb.0: // %entry @@ -411,10 +469,10 @@ entry: } define <8 x i16> @test_vabdl_high_s82(<16 x i8> %a, <8 x i16> %bb) { -; CHECK-LE-LABEL: test_vabdl_high_s82: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: sabdl2 v0.8h, v0.16b, v1.16b -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_vabdl_high_s82: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sabdl2 v0.8h, v0.16b, v1.16b +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_vabdl_high_s82: ; CHECK-BE: // %bb.0: // %entry @@ -436,10 +494,10 @@ entry: } define <4 x i32> @test_vqdmlal_high_s16_bitcast(<4 x i32> %a, <8 x i16> %b, <16 x i8> %cc) { -; CHECK-LE-LABEL: test_vqdmlal_high_s16_bitcast: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: sqdmlal2 v0.4s, v1.8h, v2.8h -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_vqdmlal_high_s16_bitcast: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqdmlal2 v0.4s, v1.8h, v2.8h +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_vqdmlal_high_s16_bitcast: ; CHECK-BE: // %bb.0: // %entry @@ -463,12 +521,12 @@ entry: } define <8 x i16> @test_pmull_high_p8_128(i128 %aa, i128 %bb) { -; CHECK-LE-LABEL: test_pmull_high_p8_128: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: fmov d0, x3 -; CHECK-LE-NEXT: fmov d1, x1 -; CHECK-LE-NEXT: pmull v0.8h, v1.8b, v0.8b -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_pmull_high_p8_128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov d0, x3 +; CHECK-NEXT: fmov d1, x1 +; CHECK-NEXT: pmull v0.8h, v1.8b, v0.8b +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_pmull_high_p8_128: ; CHECK-BE: // %bb.0: // %entry @@ -490,10 +548,10 @@ entry: } define <8 x i16> @test_pmull_high_p8_64(<2 x i64> %aa, <2 x i64> %bb) { -; CHECK-LE-LABEL: test_pmull_high_p8_64: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: pmull2 v0.8h, v0.16b, v1.16b -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_pmull_high_p8_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: pmull2 v0.8h, v0.16b, v1.16b +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_pmull_high_p8_64: ; CHECK-BE: // %bb.0: // %entry @@ -532,6 +590,14 @@ define <8 x i16> @foov8i16(<16 x i8> %a1, <2 x i64> %b1) { ; CHECK-BE-NEXT: rev64 v0.8h, v0.8h ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: foov8i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #5 +; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #5 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret %a0 = bitcast <16 x i8> %a1 to <4 x i32> %b0 = bitcast <2 x i64> %b1 to <4 x i32> %vshrn_low_shift = lshr <4 x i32> %a0, <i32 5, i32 5, i32 5, i32 5> @@ -558,6 +624,12 @@ define <2 x i64> @hadd32_zext_asr(<16 x i8> %src1a) { ; CHECK-BE-NEXT: ushll2 v0.2d, v0.4s, #1 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: hadd32_zext_asr: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #1 +; CHECK-GI-NEXT: ret %src1 = bitcast <16 x i8> %src1a to <4 x i32> %s1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %zextsrc1 = zext <2 x i32> %s1 to <2 x i64> @@ -580,6 +652,12 @@ define <2 x i64> @test_umull_high_s16_splata1(<2 x i64> %aa, <4 x i32> %b) #0 { ; CHECK-BE-NEXT: umull2 v0.2d, v1.4s, v0.s[1] ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; 
CHECK-GI-LABEL: test_umull_high_s16_splata1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: umull v0.2d, v1.2s, v0.s[1] +; CHECK-GI-NEXT: ret entry: %a = bitcast <2 x i64> %aa to <4 x i32> %s1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 1, i32 1> diff --git a/llvm/test/CodeGen/AArch64/neon-saba.ll b/llvm/test/CodeGen/AArch64/neon-saba.ll index 78ccc89..19967bd 100644 --- a/llvm/test/CodeGen/AArch64/neon-saba.ll +++ b/llvm/test/CodeGen/AArch64/neon-saba.ll @@ -1,13 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple aarch64-unknown-linux-gnu < %s | FileCheck %s +; RUN: llc -mtriple aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple aarch64-unknown-linux-gnu -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; SABA from ADD(ABS(SUB NSW)) define <4 x i32> @saba_abs_4s(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { -; CHECK-LABEL: saba_abs_4s: -; CHECK: // %bb.0: -; CHECK-NEXT: saba v0.4s, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_abs_4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: saba v0.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_abs_4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: abs v1.4s, v1.4s +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: ret %sub = sub nsw <4 x i32> %b, %c %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true) %add = add <4 x i32> %a, %abs @@ -15,10 +23,17 @@ define <4 x i32> @saba_abs_4s(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { } define <2 x i32> @saba_abs_2s(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { -; CHECK-LABEL: saba_abs_2s: -; CHECK: // %bb.0: -; CHECK-NEXT: saba v0.2s, v1.2s, v2.2s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_abs_2s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: saba v0.2s, v1.2s, v2.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_abs_2s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: abs v1.2s, v1.2s +; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: ret %sub = sub nsw <2 x i32> %b, %c %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %sub, i1 true) %add = add <2 x i32> %a, %abs @@ -26,10 +41,17 @@ define <2 x i32> @saba_abs_2s(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { } define <8 x i16> @saba_abs_8h(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { -; CHECK-LABEL: saba_abs_8h: -; CHECK: // %bb.0: -; CHECK-NEXT: saba v0.8h, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_abs_8h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: saba v0.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_abs_8h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: abs v1.8h, v1.8h +; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: ret %sub = sub nsw <8 x i16> %b, %c %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true) %add = add <8 x i16> %a, %abs @@ -37,10 +59,17 @@ define <8 x i16> @saba_abs_8h(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { } define <4 x i16> @saba_abs_4h(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { -; CHECK-LABEL: saba_abs_4h: -; CHECK: // %bb.0: -; CHECK-NEXT: saba v0.4h, v1.4h, v2.4h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_abs_4h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: saba v0.4h, v1.4h, v2.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_abs_4h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: abs v1.4h, v1.4h +; 
CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: ret %sub = sub nsw <4 x i16> %b, %c %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %sub, i1 true) %add = add <4 x i16> %a, %abs @@ -48,10 +77,17 @@ define <4 x i16> @saba_abs_4h(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { } define <16 x i8> @saba_abs_16b(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { -; CHECK-LABEL: saba_abs_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: saba v0.16b, v1.16b, v2.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_abs_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: saba v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_abs_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: abs v1.16b, v1.16b +; CHECK-GI-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ret %sub = sub nsw <16 x i8> %b, %c %abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %sub, i1 true) %add = add <16 x i8> %a, %abs @@ -59,10 +95,17 @@ define <16 x i8> @saba_abs_16b(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { } define <8 x i8> @saba_abs_8b(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { -; CHECK-LABEL: saba_abs_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: saba v0.8b, v1.8b, v2.8b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_abs_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: saba v0.8b, v1.8b, v2.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_abs_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub v1.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: abs v1.8b, v1.8b +; CHECK-GI-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: ret %sub = sub nsw <8 x i8> %b, %c %abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %sub, i1 true) %add = add <8 x i8> %a, %abs diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll index b124042..c57383a 100644 --- a/llvm/test/CodeGen/AArch64/rem-by-const.ll +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -52,7 +52,6 @@ define i8 @si8_100(i8 %a, i8 %b) { ; CHECK-GI-NEXT: sxtb w8, w0 ; CHECK-GI-NEXT: mov w9, #41 // =0x29 ; CHECK-GI-NEXT: mul w8, w8, w9 -; CHECK-GI-NEXT: sxth w8, w8 ; CHECK-GI-NEXT: sbfx w8, w8, #8, #8 ; CHECK-GI-NEXT: asr w8, w8, #4 ; CHECK-GI-NEXT: ubfx w9, w8, #7, #1 diff --git a/llvm/test/CodeGen/AArch64/stack-tagging.ll b/llvm/test/CodeGen/AArch64/stack-tagging.ll index 8759fb1..5d73c7b 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging.ll @@ -143,54 +143,4 @@ l: ; CHECK-NOT: @llvm.aarch64.irg.sp ; CHECK: ret void -; If we can't trace one of the lifetime markers to a single alloca, fall back -; to poisoning all allocas at the beginning of the function. -; Each alloca must be poisoned only once. 
-define void @UnrecognizedLifetime(i8 %v) sanitize_memtag { -entry: - %x = alloca i32, align 4 - %y = alloca i32, align 4 - %z = alloca i32, align 4 - %tobool = icmp eq i8 %v, 0 - %xy = select i1 %tobool, ptr %x, ptr %y - %cxcy = select i1 %tobool, ptr %x, ptr %y - br label %another_bb - -another_bb: - call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %z) - store i32 7, ptr %z - call void @noUse32(ptr %z) - call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %z) - call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %z) - store i32 7, ptr %z - call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %z) - call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cxcy) - store i32 8, ptr %xy - call void @noUse32(ptr %x) - call void @noUse32(ptr %y) - call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cxcy) - ret void -} - -; CHECK-LABEL: define void @UnrecognizedLifetime( -; CHECK: call ptr @llvm.aarch64.irg.sp(i64 0) -; CHECK: alloca { i32, [12 x i8] }, align 16 -; CHECK: call ptr @llvm.aarch64.tagp -; CHECK: call void @llvm.aarch64.settag( -; CHECK: alloca { i32, [12 x i8] }, align 16 -; CHECK: call ptr @llvm.aarch64.tagp -; CHECK: call void @llvm.aarch64.settag( -; CHECK: alloca { i32, [12 x i8] }, align 16 -; CHECK: call ptr @llvm.aarch64.tagp -; CHECK: call void @llvm.aarch64.settag( -; CHECK: store i32 -; CHECK: call void @noUse32(ptr -; CHECK: store i32 -; CHECK: store i32 -; CHECK: call void @noUse32(ptr -; CHECK: call void @llvm.aarch64.settag( -; CHECK: call void @llvm.aarch64.settag( -; CHECK: call void @llvm.aarch64.settag( -; CHECK: ret void - !0 = !{} diff --git a/llvm/test/CodeGen/AArch64/wineh-reuse-catch-alloca.ll b/llvm/test/CodeGen/AArch64/wineh-reuse-catch-alloca.ll deleted file mode 100644 index 18b8aab..0000000 --- a/llvm/test/CodeGen/AArch64/wineh-reuse-catch-alloca.ll +++ /dev/null @@ -1,100 +0,0 @@ -; RUN: llc %s --mtriple=aarch64-pc-windows-msvc -o - | FileCheck %s - -; Tests the fixed object layouts when two catchpads re-use the same stack -; allocation for their catch objects. - -; Generated from this C++ code, with modifications to the IR (see comments in -; IR): -; https://godbolt.org/z/9qv5Yn68j -; > clang --target=aarch64-pc-windows-msvc test.cpp -; ``` -; extern "C" void boom(); -; extern "C" int calls_boom(); -; { -; try { boom(); } -; catch (int& i) { return i; } -; catch (long& l) { return l; } -; return 0; -; } -; ``` - -; Only need 48 bytes on the stack, not 64. -; CHECK-LABEL: calls_boom: -; CHECK: sub sp, sp, #48 -; CHECK: .seh_stackalloc 48 - -; Both the catch blocks load from the same address. -; CHECK-LABEL: "?catch$3@?0?calls_boom@4HA": -; CHECK: ldr x8, [x29, #24] -; CHECK-LABEL: "?catch$4@?0?calls_boom@4HA": -; CHECK: ldr x8, [x29, #24] - -; There's enough space for the UnwindHelp to be at -16 instead of -32 -; CHECK-LABEL: $cppxdata$calls_boom: -; CHECK: .word -16 // UnwindHelp - -; Both catches have the same object offset. 
-; CHECK-LABEL: $handlerMap$0$calls_boom:
-; CHECK: .word -8 // CatchObjOffset
-; CHECK-NEXT: .word "?catch$3@?0?calls_boom@4HA"@IMGREL // Handler
-; CHECK: .word -8 // CatchObjOffset
-; CHECK-NEXT: .word "?catch$4@?0?calls_boom@4HA"@IMGREL // Handler
-
-%rtti.TypeDescriptor2 = type { ptr, ptr, [3 x i8] }
-
-$"??_R0H@8" = comdat any
-
-$"??_R0J@8" = comdat any
-
-@"??_7type_info@@6B@" = external constant ptr
-@"??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { ptr @"??_7type_info@@6B@", ptr null, [3 x i8] c".H\00" }, comdat
-@"??_R0J@8" = linkonce_odr global %rtti.TypeDescriptor2 { ptr @"??_7type_info@@6B@", ptr null, [3 x i8] c".J\00" }, comdat
-
-define dso_local i32 @calls_boom() personality ptr @__CxxFrameHandler3 {
-entry:
-  %retval = alloca i32, align 4
-; MODIFICATION: Remove unused alloca
-; %l = alloca ptr, align 8
-  %i = alloca ptr, align 8
-  invoke void @boom()
-          to label %invoke.cont unwind label %catch.dispatch
-
-catch.dispatch:
-  %0 = catchswitch within none [label %catch1, label %catch] unwind to caller
-
-catch1:
-  %1 = catchpad within %0 [ptr @"??_R0H@8", i32 8, ptr %i]
-  %2 = load ptr, ptr %i, align 8
-  %3 = load i32, ptr %2, align 4
-  store i32 %3, ptr %retval, align 4
-  catchret from %1 to label %catchret.dest2
-
-catch:
-; MODIFICATION: Use %i instead of %l
-  %4 = catchpad within %0 [ptr @"??_R0J@8", i32 8, ptr %i]
-  %5 = load ptr, ptr %i, align 8
-  %6 = load i32, ptr %5, align 4
-  store i32 %6, ptr %retval, align 4
-  catchret from %4 to label %catchret.dest
-
-invoke.cont:
-  br label %try.cont
-
-catchret.dest:
-  br label %return
-
-catchret.dest2:
-  br label %return
-
-try.cont:
-  store i32 0, ptr %retval, align 4
-  br label %return
-
-return:
-  %7 = load i32, ptr %retval, align 4
-  ret i32 %7
-}
-
-declare dso_local void @boom() #1
-
-declare dso_local i32 @__CxxFrameHandler3(...)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 9b35920..fa4676e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -3211,7 +3211,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: enable_ieee_mode = 1 ; GFX10-NEXT: enable_wgp_mode = 1 ; GFX10-NEXT: enable_mem_ordered = 1 -; GFX10-NEXT: enable_fwd_progress = 0 +; GFX10-NEXT: enable_fwd_progress = 1 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 ; GFX10-NEXT: user_sgpr_count = 14 ; GFX10-NEXT: enable_trap_handler = 0 @@ -3303,7 +3303,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: enable_ieee_mode = 1 ; GFX11-NEXT: enable_wgp_mode = 1 ; GFX11-NEXT: enable_mem_ordered = 1 -; GFX11-NEXT: enable_fwd_progress = 0 +; GFX11-NEXT: enable_fwd_progress = 1 ; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 ; GFX11-NEXT: user_sgpr_count = 13 ; GFX11-NEXT: enable_trap_handler = 0 @@ -4215,7 +4215,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_ieee_mode = 1 ; GFX10-NEXT: enable_wgp_mode = 1 ; GFX10-NEXT: enable_mem_ordered = 1 -; GFX10-NEXT: enable_fwd_progress = 0 +; GFX10-NEXT: enable_fwd_progress = 1 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 ; GFX10-NEXT: user_sgpr_count = 14 ; GFX10-NEXT: enable_trap_handler = 0 @@ -4300,7 +4300,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: enable_ieee_mode = 1 ; GFX11-NEXT: enable_wgp_mode = 1 ; GFX11-NEXT: enable_mem_ordered = 1 -; GFX11-NEXT: enable_fwd_progress = 0 +; GFX11-NEXT: enable_fwd_progress = 1 ; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 ; GFX11-NEXT: user_sgpr_count = 13 ; GFX11-NEXT: enable_trap_handler = 0 @@ -4569,7 +4569,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_ieee_mode = 1 ; GFX10-NEXT: enable_wgp_mode = 1 ; GFX10-NEXT: enable_mem_ordered = 1 -; GFX10-NEXT: enable_fwd_progress = 0 +; GFX10-NEXT: enable_fwd_progress = 1 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 ; GFX10-NEXT: user_sgpr_count = 14 ; GFX10-NEXT: enable_trap_handler = 0 @@ -4657,7 +4657,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: enable_ieee_mode = 1 ; GFX11-NEXT: enable_wgp_mode = 1 ; GFX11-NEXT: enable_mem_ordered = 1 -; GFX11-NEXT: enable_fwd_progress = 0 +; GFX11-NEXT: enable_fwd_progress = 1 ; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 ; GFX11-NEXT: user_sgpr_count = 13 ; GFX11-NEXT: enable_trap_handler = 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir index 5b8c284..dde566d9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir @@ -1,6 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o 
- %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s --- name: bswap_i32_vv @@ -19,6 +21,7 @@ body: | ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16711935 ; GFX7-NEXT: [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 [[S_MOV_B32_]], [[V_ALIGNBIT_B32_e64_1]], [[V_ALIGNBIT_B32_e64_]], implicit $exec ; GFX7-NEXT: S_ENDPGM 0, implicit [[V_BFI_B32_e64_]] + ; ; GFX8-LABEL: name: bswap_i32_vv ; GFX8: liveins: $vgpr0 ; GFX8-NEXT: {{ $}} @@ -26,6 +29,22 @@ body: | ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051 ; GFX8-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]] + ; + ; GFX9-LABEL: name: bswap_i32_vv + ; GFX9: liveins: $vgpr0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051 + ; GFX9-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec + ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]] + ; + ; GFX10-LABEL: name: bswap_i32_vv + ; GFX10: liveins: $vgpr0 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051 + ; GFX10-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_BSWAP %0 S_ENDPGM 0, implicit %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir index 0a4cb3cc..fa95f33 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s --- @@ -24,6 +24,24 @@ body: | ; GCN-NEXT: [[V_ALIGNBIT_B32_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_e64_]] ; + ; GFX9-LABEL: name: fshr_s32 + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9-NEXT: [[V_ALIGNBIT_B32_opsel_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_opsel_e64 0, 
[[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_opsel_e64_]] + ; + ; GFX10-LABEL: name: fshr_s32 + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_ALIGNBIT_B32_opsel_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_opsel_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_opsel_e64_]] + ; ; GFX11-LABEL: name: fshr_s32 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir index cd69104..69e3561 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir @@ -80,8 +80,7 @@ body: | ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SEXT_INREG]], [[SEXT_INREG1]] ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[MUL]], [[C]](s32) - ; GFX8-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR]], 16 - ; GFX8-NEXT: $vgpr0 = COPY [[SEXT_INREG2]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[ASHR]](s32) ; ; GFX9-LABEL: name: test_smulh_s16 ; GFX9: liveins: $vgpr0, $vgpr1 @@ -93,8 +92,7 @@ body: | ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SEXT_INREG]], [[SEXT_INREG1]] ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[MUL]], [[C]](s32) - ; GFX9-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR]], 16 - ; GFX9-NEXT: $vgpr0 = COPY [[SEXT_INREG2]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[ASHR]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_TRUNC %0 @@ -200,9 +198,7 @@ body: | ; GFX9-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV3]], 16 ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[SEXT_INREG2]], [[SEXT_INREG3]] ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[MUL1]], [[C]](s32) - ; GFX9-NEXT: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR]], 16 - ; GFX9-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR1]], 16 - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG4]](s32), [[SEXT_INREG5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ASHR]](s32), [[ASHR1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-sbfx.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-sbfx.mir index 2c545c8..1025d60 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-sbfx.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-sbfx.mir @@ -92,8 +92,7 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GCN-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY]], [[C]](s32) - ; GCN-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR]], 20 - ; GCN-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[ASHR]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_CONSTANT i32 16 %2:_(s32) = G_ASHR %0, %1(s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir new file mode 100644 index 0000000..beca901 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir @@ -0,0 +1,40 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +--- +name: basic_test +legalized: true +machineFunctionInfo: + isWholeWaveFunction: true +body: | + bb.1: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: basic_test + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:vcc(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[COPY2]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY1]], [[COPY3]] + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s32), [[SELECT1]](s32), 1, 1, 1, 0 + ; CHECK-NEXT: $vgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s32) = COPY $vgpr1 + %0:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + %12:_(s32) = G_CONSTANT i32 5 + %11:_(s32) = G_SELECT %0(s1), %1, %12 + %14:_(s32) = G_CONSTANT i32 3 + %13:_(s32) = G_SELECT %0(s1), %2, %14 + %15:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), %11(s32), %13(s32), 1, 1, 1, 0 + $vgpr0 = COPY %15(s32) + G_AMDGPU_WHOLE_WAVE_FUNC_RETURN %0(s1), implicit $vgpr0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll index d4826a2..6044f6e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll @@ -7,7 +7,7 @@ ; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf0000{{$}} ; VI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf02c0{{$}} ; GFX9-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf0000{{$}} -; GFX12-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x600f0000{{$}} +; GFX12-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xe00f0000{{$}} define amdgpu_cs half @cs_amdpal(half %arg0) #0 { %add = fadd half %arg0, 1.0 ret half %add diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll index 7ce5a00..d91b2117 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll @@ -514,9 +514,9 @@ define internal void @callee_no_alias_addr_space_select(ptr %ptr1, ptr %ptr2, pt ret void } -define internal void @callee_alias_addr_space_branch(ptr %ptr1, ptr %ptr2, ptr %ptr3, i1 %cond1, i1 %cond2, i32 %val, i32 %offset) #0 { +define internal void @callee_alias_addr_space_branch(ptr %ptr1, ptr %ptr2, ptr %ptr3, i1 %cond1, i1 %cond2, i32 %val) #0 { ; CHECK-LABEL: define internal void @callee_alias_addr_space_branch( -; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[VAL:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1]] { +; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[VAL:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: br i1 [[COND1]], label %[[BB_1_TRUE:.*]], label %[[BB_1_FALSE:.*]] ; CHECK: [[BB_1_TRUE]]: ; CHECK-NEXT: br label %[[BB_1_END:.*]] diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll index 029604c..b49614d 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll @@ -2,6 +2,27 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s ; TODO: Add global-isel when it can support bf16 +define amdgpu_ps void @llvm_sqrt_bf16_v(ptr addrspace(1) %out, bfloat %src) { +; GCN-LABEL: llvm_sqrt_bf16_v: +; GCN: ; %bb.0: +; GCN-NEXT: v_sqrt_bf16_e32 v2, v2 +; GCN-NEXT: global_store_b16 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %sqrt = call bfloat @llvm.sqrt.bf16(bfloat %src) + store bfloat %sqrt, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_ps void @llvm_sqrt_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) { +; GCN-LABEL: llvm_sqrt_bf16_s: +; GCN: ; %bb.0: +; GCN-NEXT: v_sqrt_bf16_e32 v2, s0 +; GCN-NEXT: global_store_b16 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %sqrt = call bfloat @llvm.sqrt.bf16(bfloat %src) + store bfloat %sqrt, ptr addrspace(1) %out, align 2 + ret void +} define amdgpu_ps void @llvm_log2_bf16_v(ptr addrspace(1) %out, bfloat %src) { ; GCN-LABEL: llvm_log2_bf16_v: @@ -47,5 +68,6 @@ define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src ret void } +declare bfloat @llvm.sqrt.bf16(bfloat) declare bfloat @llvm.log2.bf16(bfloat) declare bfloat @llvm.exp2.bf16(bfloat) diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index cd6d741..7859fcdf 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -2,7 +2,8 @@ ; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN ; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s 
-check-prefixes=GFX7 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9,GFX900 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 | FileCheck %s -check-prefixes=GFX9,GFX950 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16 @@ -967,12 +968,21 @@ define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_store_global_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dword v[1:2], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_store_global_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_store_dword v[1:2], v0, off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_store_global_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-NEXT: global_store_dword v[2:3], v0, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_store_global_v2bf16: ; GFX10: ; %bb.0: @@ -2019,23 +2029,41 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_store_global_v64bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[0:3], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_store_global_v64bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32 +; 
GFX900-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[0:3], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_store_global_v64bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:8 +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[0:3], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_store_global_v64bf16: ; GFX10: ; %bb.0: @@ -2204,20 +2232,30 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_load_store_f32_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc -; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_load_store_f32_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v0, v[0:1], off +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; GFX900-NEXT: global_store_short_d16_hi v[2:3], v0, off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_load_store_f32_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v0, v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: global_store_short v[2:3], v0, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_load_store_f32_to_bf16: ; GFX10: ; %bb.0: @@ -2308,30 +2346,50 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_load_store_f64_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_movk_i32 s8, 0x7fff -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX9-NEXT: v_and_b32_e32 v7, 1, v6 -; GFX9-NEXT: v_cmp_eq_u32_e64 
s[4:5], 1, v7 -; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]| -; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] -; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 -; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_add3_u32 v4, v5, v4, s8 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc -; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_load_store_f64_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] +; GFX900-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX900-NEXT: v_and_b32_e32 v7, 1, v6 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7 +; GFX900-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]| +; GFX900-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] +; GFX900-NEXT: v_add_u32_e32 v4, v6, v4 +; GFX900-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX900-NEXT: v_add3_u32 v4, v5, v4, s8 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc +; GFX900-NEXT: global_store_short_d16_hi v[2:3], v0, off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_load_store_f64_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] +; GFX950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX950-NEXT: v_and_b32_e32 v7, 1, v6 +; GFX950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| +; GFX950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7 +; GFX950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] +; GFX950-NEXT: v_add_u32_e32 v0, v6, v0 +; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: global_store_short v[2:3], v0, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_load_store_f64_to_bf16: ; GFX10: ; %bb.0: @@ -2858,12 +2916,21 @@ define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_arg_store: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_short v[1:2], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_arg_store: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_store_short v[1:2], v0, off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_arg_store: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-NEXT: global_store_short v[2:3], v0, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_arg_store: ; GFX10: ; %bb.0: @@ -2918,12 +2985,21 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_arg_store_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dword v[1:2], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_arg_store_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_store_dword v[1:2], v0, off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_arg_store_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-NEXT: global_store_dword v[2:3], v0, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_arg_store_v2bf16: ; GFX10: ; %bb.0: @@ -3384,12 +3460,19 @@ define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_byval: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_byval: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_store_short v0, off, s[0:3], s32 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_byval: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_store_short off, v0, s32 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_byval: ; GFX10: ; %bb.0: @@ -3440,12 +3523,19 @@ define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_sret: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_sret: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_sret: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_store_short v0, v1, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_sret: ; GFX10: ; %bb.0: @@ -3907,34 +3997,63 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, 
test_arg_store@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_writelane_b32 v2, s30, 0 -; GFX9-NEXT: v_writelane_b32 v2, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v2, 1 -; GFX9-NEXT: v_readlane_b32 s30, v2, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s33, s18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_call: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s18, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX900-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[16:17] +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX900-NEXT: v_writelane_b32 v2, s30, 0 +; GFX900-NEXT: v_writelane_b32 v2, s31, 1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_readlane_b32 s31, v2, 1 +; GFX900-NEXT: v_readlane_b32 s30, v2, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_mov_b32 s33, s18 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_call: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b32 s2, s33 +; GFX950-NEXT: s_mov_b32 s33, s32 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_store_dword off, v4, s33 ; 4-byte Folded Spill +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_add_i32 s32, s32, 16 +; GFX950-NEXT: s_getpc_b64 s[0:1] +; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store@gotpcrel32@lo+4 +; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store@gotpcrel32@hi+12 +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX950-NEXT: v_writelane_b32 v4, s30, 0 +; GFX950-NEXT: v_writelane_b32 v4, s31, 1 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: scratch_store_short v1, v0, off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_readlane_b32 s31, v4, 1 +; GFX950-NEXT: v_readlane_b32 s30, v4, 0 +; GFX950-NEXT: s_mov_b32 s32, s33 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_load_dword v4, off, s33 ; 4-byte Folded Reload +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_mov_b32 s33, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call: ; GFX10: ; %bb.0: ; %entry @@ -4104,34 +4223,63 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v2bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_writelane_b32 v2, s30, 0 -; GFX9-NEXT: v_writelane_b32 v2, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v2, 1 -; GFX9-NEXT: v_readlane_b32 s30, v2, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s33, s18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_call_v2bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s18, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX900-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[16:17] +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX900-NEXT: v_writelane_b32 v2, s30, 0 +; GFX900-NEXT: v_writelane_b32 v2, s31, 1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_readlane_b32 s31, v2, 1 +; GFX900-NEXT: v_readlane_b32 s30, v2, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_mov_b32 s33, s18 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_call_v2bf16: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b32 s2, s33 +; GFX950-NEXT: s_mov_b32 s33, s32 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_store_dword off, v4, s33 ; 4-byte Folded Spill +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_add_i32 s32, s32, 16 +; GFX950-NEXT: s_getpc_b64 s[0:1] +; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX950-NEXT: v_writelane_b32 v4, s30, 0 +; GFX950-NEXT: v_writelane_b32 v4, s31, 1 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: scratch_store_dword v1, v0, off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_readlane_b32 s31, v4, 1 +; GFX950-NEXT: v_readlane_b32 s30, v4, 0 +; GFX950-NEXT: s_mov_b32 s32, s33 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_load_dword v4, off, s33 ; 4-byte Folded Reload +; GFX950-NEXT: s_mov_b64 
exec, s[0:1] +; GFX950-NEXT: s_mov_b32 s33, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v2bf16: ; GFX10: ; %bb.0: ; %entry @@ -4308,36 +4456,68 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v3bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_writelane_b32 v3, s30, 0 -; GFX9-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v3, 1 -; GFX9-NEXT: v_readlane_b32 s30, v3, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s33, s18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_call_v3bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s18, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[16:17] +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX900-NEXT: v_writelane_b32 v3, s30, 0 +; GFX900-NEXT: v_writelane_b32 v3, s31, 1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_readlane_b32 s31, v3, 1 +; GFX900-NEXT: v_readlane_b32 s30, v3, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_mov_b32 s33, s18 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_call_v3bf16: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b32 s2, s33 +; GFX950-NEXT: s_mov_b32 s33, s32 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_store_dword off, v5, s33 ; 4-byte Folded Spill +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_add_i32 s32, s32, 16 +; GFX950-NEXT: s_getpc_b64 s[0:1] +; GFX950-NEXT: s_add_u32 s0, s0, 
test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX950-NEXT: v_writelane_b32 v5, s30, 0 +; GFX950-NEXT: v_writelane_b32 v5, s31, 1 +; GFX950-NEXT: v_mov_b32_e32 v4, v2 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: scratch_store_short v4, v1, off offset:4 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: scratch_store_dword v4, v0, off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_readlane_b32 s31, v5, 1 +; GFX950-NEXT: v_readlane_b32 s30, v5, 0 +; GFX950-NEXT: s_mov_b32 s32, s33 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_mov_b32 s33, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v3bf16: ; GFX10: ; %bb.0: ; %entry @@ -4534,36 +4714,66 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v4bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_writelane_b32 v3, s30, 0 -; GFX9-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v3, 1 -; GFX9-NEXT: v_readlane_b32 s30, v3, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s33, s18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_call_v4bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s18, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[16:17] +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX900-NEXT: v_writelane_b32 v3, s30, 0 +; GFX900-NEXT: v_writelane_b32 v3, s31, 1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_readlane_b32 s31, v3, 1 +; 
GFX900-NEXT: v_readlane_b32 s30, v3, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_mov_b32 s33, s18 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_call_v4bf16: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b32 s2, s33 +; GFX950-NEXT: s_mov_b32 s33, s32 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_store_dword off, v5, s33 ; 4-byte Folded Spill +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_add_i32 s32, s32, 16 +; GFX950-NEXT: s_getpc_b64 s[0:1] +; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX950-NEXT: v_writelane_b32 v5, s30, 0 +; GFX950-NEXT: v_writelane_b32 v5, s31, 1 +; GFX950-NEXT: v_mov_b32_e32 v4, v2 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_readlane_b32 s31, v5, 1 +; GFX950-NEXT: v_readlane_b32 s30, v5, 0 +; GFX950-NEXT: s_mov_b32 s32, s33 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_mov_b32 s33, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v4bf16: ; GFX10: ; %bb.0: ; %entry @@ -4804,40 +5014,69 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v8bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_writelane_b32 v5, s30, 0 -; GFX9-NEXT: v_writelane_b32 v5, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v5, 1 -; GFX9-NEXT: v_readlane_b32 s30, v5, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s33, s18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_call_v8bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 
s18, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX900-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[16:17] +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX900-NEXT: v_writelane_b32 v5, s30, 0 +; GFX900-NEXT: v_writelane_b32 v5, s31, 1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_readlane_b32 s31, v5, 1 +; GFX900-NEXT: v_readlane_b32 s30, v5, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_mov_b32 s33, s18 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_call_v8bf16: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b32 s2, s33 +; GFX950-NEXT: s_mov_b32 s33, s32 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_store_dword off, v5, s33 ; 4-byte Folded Spill +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_add_i32 s32, s32, 16 +; GFX950-NEXT: s_getpc_b64 s[0:1] +; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX950-NEXT: v_writelane_b32 v5, s30, 0 +; GFX950-NEXT: v_writelane_b32 v5, s31, 1 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: scratch_store_dwordx4 v4, v[0:3], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_readlane_b32 s31, v5, 1 +; GFX950-NEXT: v_readlane_b32 s30, v5, 0 +; GFX950-NEXT: s_mov_b32 s32, s33 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_mov_b32 s33, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v8bf16: ; GFX10: ; %bb.0: ; %entry @@ -5174,48 +5413,79 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v16bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], 
s[16:17], 0x0 -; GFX9-NEXT: v_writelane_b32 v9, s30, 0 -; GFX9-NEXT: v_writelane_b32 v9, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v9, 1 -; GFX9-NEXT: v_readlane_b32 s30, v9, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s33, s18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_call_v16bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s18, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX900-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[16:17] +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX900-NEXT: v_writelane_b32 v9, s30, 0 +; GFX900-NEXT: v_writelane_b32 v9, s31, 1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_readlane_b32 s31, v9, 1 +; GFX900-NEXT: v_readlane_b32 s30, v9, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_mov_b32 s33, s18 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_call_v16bf16: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b32 s2, s33 +; GFX950-NEXT: s_mov_b32 s33, s32 +; 
GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_store_dword off, v9, s33 ; 4-byte Folded Spill +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_add_i32 s32, s32, 16 +; GFX950-NEXT: s_getpc_b64 s[0:1] +; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX950-NEXT: v_writelane_b32 v9, s30, 0 +; GFX950-NEXT: v_writelane_b32 v9, s31, 1 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: scratch_store_dwordx4 v8, v[4:7], off offset:16 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: scratch_store_dwordx4 v8, v[0:3], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_readlane_b32 s31, v9, 1 +; GFX950-NEXT: v_readlane_b32 s30, v9, 0 +; GFX950-NEXT: s_mov_b32 s32, s33 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_load_dword v9, off, s33 ; 4-byte Folded Reload +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_mov_b32 s33, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v16bf16: ; GFX10: ; %bb.0: ; %entry @@ -5332,14 +5602,23 @@ define bfloat @test_alloca_load_store_ret(bfloat %in) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_alloca_load_store_ret: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_alloca_load_store_ret: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_store_short v0, off, s[0:3], s32 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_alloca_load_store_ret: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_store_short off, v0, s32 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: scratch_load_ushort v0, off, s32 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_alloca_load_store_ret: ; GFX10: ; %bb.0: ; %entry @@ -5625,52 +5904,72 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_overflow_stack: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen 
offset:84 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_overflow_stack: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112 +; GFX900-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108 +; GFX900-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 +; GFX900-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 +; GFX900-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 +; GFX900-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 +; GFX900-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 +; GFX900-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 +; GFX900-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 +; GFX900-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 +; GFX900-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 +; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 +; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68 +; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64 +; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 +; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 +; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 +; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48 +; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40 +; 
GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 +; GFX900-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; GFX900-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; GFX900-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; GFX900-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; GFX900-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX900-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(25) +; GFX900-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:124 +; GFX900-NEXT: s_waitcnt vmcnt(25) +; GFX900-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120 +; GFX900-NEXT: s_waitcnt vmcnt(25) +; GFX900-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:116 +; GFX900-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_overflow_stack: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:8 +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[26:29], off offset:96 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[22:25], off offset:80 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[18:21], off offset:64 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[14:17], off offset:48 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[10:13], off offset:32 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[6:9], off offset:16 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[2:5], off +; GFX950-NEXT: s_waitcnt vmcnt(7) +; GFX950-NEXT: scratch_store_dwordx4 v0, v[30:33], off offset:112 +; GFX950-NEXT: scratch_store_short v0, v1, off offset:128 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_overflow_stack: ; GFX10: ; %bb.0: @@ -5870,15 +6169,25 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v3bf16_to_v3f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: global_extload_v3bf16_to_v3f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_extload_v3bf16_to_v3f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
global_extload_v3bf16_to_v3f32: ; GFX10: ; %bb.0: @@ -6120,18 +6429,31 @@ define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v6bf16_to_v6f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx3 v[3:5], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: global_extload_v6bf16_to_v6f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx3 v[3:5], v[0:1], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_extload_v6bf16_to_v6f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx3 v[4:6], v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v6bf16_to_v6f32: ; GFX10: ; %bb.0: @@ -6766,16 +7088,27 @@ define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v2bf16_to_v2f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: global_extload_v2bf16_to_v2f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v2, v[0:1], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_extload_v2bf16_to_v2f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v0, v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v2bf16_to_v2f64: ; GFX10: ; %bb.0: @@ -6852,18 +7185,31 @@ define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; 
GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v3bf16_to_v3f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: global_extload_v3bf16_to_v3f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_extload_v3bf16_to_v3f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v3bf16_to_v3f64: ; GFX10: ; %bb.0: @@ -8476,193 +8822,363 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v32bf16_to_v32f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v9, v[1:2], off offset:62 -; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:60 -; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:58 -; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:56 -; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:54 -; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:52 -; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:50 -; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:48 -; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:46 -; GFX9-NEXT: global_load_ushort v19, v[1:2], off offset:44 -; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:42 -; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:40 -; GFX9-NEXT: global_load_ushort v22, v[1:2], off offset:38 -; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:36 -; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:34 -; GFX9-NEXT: global_load_ushort v25, v[1:2], off offset:32 -; GFX9-NEXT: global_load_ushort v26, v[1:2], off -; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:2 -; GFX9-NEXT: global_load_ushort v3, v[1:2], off offset:16 -; GFX9-NEXT: global_load_ushort v4, v[1:2], off offset:18 -; GFX9-NEXT: global_load_ushort v5, v[1:2], off offset:20 -; GFX9-NEXT: global_load_ushort v6, v[1:2], off offset:22 -; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:24 -; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:30 -; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:26 -; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:28 -; GFX9-NEXT: global_load_ushort v31, v[1:2], off 
offset:4 -; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:6 -; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:8 -; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:10 -; GFX9-NEXT: global_load_ushort v7, v[1:2], off offset:12 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; GFX9-NEXT: s_waitcnt vmcnt(31) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v11 -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:252 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:248 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; GFX9-NEXT: s_waitcnt vmcnt(29) -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:244 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:240 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:236 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:232 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v12 -; GFX9-NEXT: s_waitcnt vmcnt(31) -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:228 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:224 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 -; GFX9-NEXT: s_waitcnt vmcnt(31) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v14 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v15 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[15:16], v16 -; GFX9-NEXT: s_waitcnt vmcnt(32) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v21 -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:212 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:208 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:204 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:200 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:196 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:192 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v21 -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v19 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v20 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:188 -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:184 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:180 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:176 -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:172 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:168 -; 
GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:164 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:160 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:156 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:152 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:148 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:144 -; GFX9-NEXT: s_waitcnt vmcnt(44) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v25 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:140 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:136 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 -; GFX9-NEXT: s_waitcnt vmcnt(38) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:132 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 -; GFX9-NEXT: s_waitcnt vmcnt(38) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v29 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v2 -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[15:16], v2 -; GFX9-NEXT: s_waitcnt vmcnt(40) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[17:18], v2 -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v2 -; GFX9-NEXT: s_waitcnt vmcnt(40) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v2 -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX9-NEXT: s_waitcnt vmcnt(40) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v10 -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68 -; 
GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: global_extload_v32bf16_to_v32f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_ushort v9, v[1:2], off offset:62 +; GFX900-NEXT: global_load_ushort v11, v[1:2], off offset:60 +; GFX900-NEXT: global_load_ushort v12, v[1:2], off offset:58 +; GFX900-NEXT: global_load_ushort v13, v[1:2], off offset:56 +; GFX900-NEXT: global_load_ushort v14, v[1:2], off offset:54 +; GFX900-NEXT: global_load_ushort v15, v[1:2], off offset:52 +; GFX900-NEXT: global_load_ushort v16, v[1:2], off offset:50 +; GFX900-NEXT: global_load_ushort v17, v[1:2], off offset:48 +; GFX900-NEXT: global_load_ushort v18, v[1:2], off offset:46 +; GFX900-NEXT: global_load_ushort v19, v[1:2], off offset:44 +; GFX900-NEXT: global_load_ushort v20, v[1:2], off offset:42 +; GFX900-NEXT: global_load_ushort v21, v[1:2], off offset:40 +; GFX900-NEXT: global_load_ushort v22, v[1:2], off offset:38 +; GFX900-NEXT: global_load_ushort v23, v[1:2], off offset:36 +; GFX900-NEXT: global_load_ushort v24, v[1:2], off offset:34 +; GFX900-NEXT: global_load_ushort v25, v[1:2], off offset:32 +; GFX900-NEXT: global_load_ushort v26, v[1:2], off +; GFX900-NEXT: global_load_ushort v27, v[1:2], off offset:2 +; GFX900-NEXT: global_load_ushort v3, v[1:2], off offset:16 +; GFX900-NEXT: global_load_ushort v4, v[1:2], off offset:18 +; GFX900-NEXT: global_load_ushort v5, v[1:2], off offset:20 +; GFX900-NEXT: global_load_ushort v6, v[1:2], off offset:22 +; GFX900-NEXT: global_load_ushort v8, v[1:2], off offset:24 +; GFX900-NEXT: global_load_ushort v28, v[1:2], off offset:30 +; GFX900-NEXT: global_load_ushort v29, v[1:2], off offset:26 +; GFX900-NEXT: global_load_ushort v30, v[1:2], off offset:28 +; GFX900-NEXT: global_load_ushort v31, v[1:2], off offset:4 +; GFX900-NEXT: global_load_ushort v32, v[1:2], off offset:6 +; GFX900-NEXT: global_load_ushort v33, v[1:2], off offset:8 +; GFX900-NEXT: global_load_ushort v34, v[1:2], off offset:10 +; GFX900-NEXT: global_load_ushort v7, v[1:2], off offset:12 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: global_load_ushort v1, v[1:2], off offset:14 +; GFX900-NEXT: s_waitcnt vmcnt(31) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: s_waitcnt vmcnt(30) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; GFX900-NEXT: s_waitcnt 
vmcnt(28) +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:252 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:248 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; GFX900-NEXT: s_waitcnt vmcnt(29) +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:244 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:240 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: s_waitcnt vmcnt(30) +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:236 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:232 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v12 +; GFX900-NEXT: s_waitcnt vmcnt(31) +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; GFX900-NEXT: s_waitcnt vmcnt(30) +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:228 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:224 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 +; GFX900-NEXT: s_waitcnt vmcnt(31) +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v14 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220 +; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v15 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[15:16], v16 +; GFX900-NEXT: s_waitcnt vmcnt(32) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; GFX900-NEXT: s_waitcnt vmcnt(30) +; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v21 +; GFX900-NEXT: s_waitcnt vmcnt(28) +; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:212 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:208 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:204 +; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:200 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:196 +; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:192 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v21 +; GFX900-NEXT: s_waitcnt vmcnt(33) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v19 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[19:20], v20 +; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:188 +; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:184 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:180 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:176 +; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:172 +; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:168 +; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:164 +; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:160 +; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:156 +; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:152 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:148 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: buffer_store_dword v11, 
v0, s[0:3], 0 offen offset:144 +; GFX900-NEXT: s_waitcnt vmcnt(44) +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v25 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:140 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:136 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 +; GFX900-NEXT: s_waitcnt vmcnt(38) +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:132 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:128 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 +; GFX900-NEXT: s_waitcnt vmcnt(38) +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:124 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:120 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:116 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:112 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v2 +; GFX900-NEXT: s_waitcnt vmcnt(41) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[15:16], v2 +; GFX900-NEXT: s_waitcnt vmcnt(40) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:108 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:104 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[17:18], v2 +; GFX900-NEXT: s_waitcnt vmcnt(41) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[19:20], v2 +; GFX900-NEXT: s_waitcnt vmcnt(40) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100 +; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[5:6], v2 +; GFX900-NEXT: s_waitcnt vmcnt(41) +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v7 +; GFX900-NEXT: s_waitcnt vmcnt(40) +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 +; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:88 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX900-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:84 +; GFX900-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:80 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 +; GFX900-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76 +; GFX900-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[1:2], v10 +; GFX900-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68 +; GFX900-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64 +; GFX900-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60 +; GFX900-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56 +; GFX900-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52 +; GFX900-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX900-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:44 +; GFX900-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:40 +; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:36 +; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:32 +; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28 +; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24 +; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20 +; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 +; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12 +; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:4 +; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_extload_v32bf16_to_v32f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse +; GFX950-NEXT: global_load_ushort v1, v[2:3], off offset:2 +; GFX950-NEXT: global_load_ushort v4, v[2:3], off offset:12 +; GFX950-NEXT: global_load_ushort v5, v[2:3], off offset:8 +; GFX950-NEXT: global_load_ushort v6, v[2:3], off offset:4 +; GFX950-NEXT: global_load_ushort v7, v[2:3], off +; GFX950-NEXT: global_load_ushort v8, v[2:3], off offset:6 +; GFX950-NEXT: global_load_ushort v9, v[2:3], off offset:10 +; GFX950-NEXT: global_load_ushort v10, v[2:3], off offset:14 +; GFX950-NEXT: global_load_ushort v11, v[2:3], off offset:18 +; GFX950-NEXT: global_load_ushort v12, v[2:3], off offset:28 +; GFX950-NEXT: global_load_ushort v13, v[2:3], off offset:24 +; GFX950-NEXT: global_load_ushort v14, v[2:3], off offset:20 +; GFX950-NEXT: global_load_ushort v15, v[2:3], off offset:16 +; GFX950-NEXT: global_load_ushort v16, v[2:3], off offset:22 +; GFX950-NEXT: global_load_ushort v17, v[2:3], off offset:26 +; GFX950-NEXT: global_load_ushort v18, v[2:3], off offset:30 +; GFX950-NEXT: global_load_ushort v19, v[2:3], off offset:34 +; GFX950-NEXT: global_load_ushort v20, v[2:3], off offset:44 +; GFX950-NEXT: global_load_ushort v21, v[2:3], off offset:40 +; GFX950-NEXT: global_load_ushort v22, v[2:3], off offset:36 +; GFX950-NEXT: global_load_ushort v23, v[2:3], off offset:32 +; GFX950-NEXT: global_load_ushort v24, v[2:3], off offset:38 +; GFX950-NEXT: global_load_ushort v25, v[2:3], off offset:42 +; GFX950-NEXT: global_load_ushort v26, v[2:3], off offset:46 +; GFX950-NEXT: global_load_ushort v42, v[2:3], off offset:50 +; GFX950-NEXT: global_load_ushort v43, v[2:3], off offset:62 +; GFX950-NEXT: global_load_ushort v46, v[2:3], off offset:60 +; GFX950-NEXT: global_load_ushort v47, v[2:3], off offset:56 +; GFX950-NEXT: global_load_ushort v60, v[2:3], off offset:52 +; GFX950-NEXT: global_load_ushort v56, v[2:3], off offset:48 +; GFX950-NEXT: global_load_ushort v57, v[2:3], off offset:54 +; GFX950-NEXT: 
global_load_ushort v58, v[2:3], off offset:58 +; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse +; GFX950-NEXT: s_waitcnt vmcnt(31) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: s_waitcnt vmcnt(30) +; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; GFX950-NEXT: s_waitcnt vmcnt(29) +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v5 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 +; GFX950-NEXT: s_waitcnt vmcnt(27) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX950-NEXT: s_waitcnt vmcnt(26) +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v9 +; GFX950-NEXT: s_waitcnt vmcnt(24) +; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; GFX950-NEXT: s_waitcnt vmcnt(23) +; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v11 +; GFX950-NEXT: s_waitcnt vmcnt(22) +; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX950-NEXT: s_waitcnt vmcnt(21) +; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GFX950-NEXT: s_waitcnt vmcnt(20) +; GFX950-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX950-NEXT: s_waitcnt vmcnt(19) +; GFX950-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX950-NEXT: s_waitcnt vmcnt(18) +; GFX950-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[12:13], v27 +; GFX950-NEXT: s_waitcnt vmcnt(16) +; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v18 +; GFX950-NEXT: s_waitcnt vmcnt(15) +; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v19 +; GFX950-NEXT: s_waitcnt vmcnt(14) +; GFX950-NEXT: v_lshlrev_b32_e32 v44, 16, v20 +; GFX950-NEXT: s_waitcnt vmcnt(13) +; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v21 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[14:15], v30 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[20:21], v31 +; GFX950-NEXT: s_waitcnt vmcnt(10) +; GFX950-NEXT: v_lshlrev_b32_e32 v49, 16, v24 +; GFX950-NEXT: s_waitcnt vmcnt(9) +; GFX950-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; GFX950-NEXT: s_waitcnt vmcnt(8) +; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v26 +; GFX950-NEXT: s_waitcnt vmcnt(7) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; GFX950-NEXT: s_waitcnt vmcnt(6) +; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v43 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[18:19], v32 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[24:25], v33 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[26:27], v36 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[32:33], v37 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[30:31], v38 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[36:37], v39 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[38:39], v44 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[44:45], v42 +; GFX950-NEXT: s_waitcnt vmcnt(5) +; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v46 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[42:43], v42 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v58 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[42:45], off offset:240 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[58:59], v46 +; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v47 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[44:45], v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[42:43], v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[56:57], v46 +; GFX950-NEXT: v_lshlrev_b32_e32 v35, 16, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v23 +; GFX950-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[56:59], off offset:224 +; 
GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[10:11], v28 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[58:59], v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[16:17], v29 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[22:23], v34 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[28:29], v35 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[34:35], v48 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[50:51], v49 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[48:49], v52 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[54:55], v53 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[52:53], v40 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[40:41], v41 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[56:57], v1 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[8:9], v7 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[56:59], off offset:208 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[42:45], off offset:192 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[38:41], off offset:176 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[52:55], off offset:160 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[48:51], off offset:144 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[34:37], off offset:128 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[30:33], off offset:112 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[26:29], off offset:96 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[22:25], off offset:80 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[18:21], off offset:64 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[14:17], off offset:48 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[10:13], off offset:32 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[6:9], off offset:16 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[2:5], off +; GFX950-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v32bf16_to_v32f64: ; GFX10: ; %bb.0: @@ -9050,20 +9566,29 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v1 
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_bf16: ; GFX10: ; %bb.0: @@ -9178,29 +9703,41 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v2bf16: ; GFX10: ; %bb.0: @@ -9363,38 +9900,54 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v3bf16: ; GFX10: ; %bb.0: @@ -9604,46 +10157,65 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v4bf16: -; GFX9: ; %bb.0: -; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX900-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; 
GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v4bf16: ; GFX10: ; %bb.0: @@ -9967,80 +10539,113 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v8bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v6, v9, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v5, v9, v5 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: 
v_or_b32_e32 v9, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX900-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX900-NEXT: v_add3_u32 v9, v9, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX900-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX900-NEXT: v_add3_u32 v9, v9, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX900-NEXT: v_add_f32_e32 v6, v9, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX900-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v5, v9, v5 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX900-NEXT: 
v_perm_b32 v3, v3, v8, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX950-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX950-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_add_f32_e32 v6, v9, v6 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v5, v9, v5 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v7 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v8bf16: ; GFX10: ; %bb.0: @@ -10656,148 +11261,209 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v16bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v15, v17, v15 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v14, v17, v14 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: 
v_add_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v13, v17, v13 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v12, v17, v12 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v11, v17, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v10, v17, v10 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v9, v17, v9 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 
-; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_v16bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX900-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX900-NEXT: v_add3_u32 v17, v17, v16, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX900-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v15, v15, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX900-NEXT: v_add_f32_e32 v15, v17, v15 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX900-NEXT: v_add3_u32 v17, v17, v15, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX900-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v14, v14, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX900-NEXT: v_add_f32_e32 v14, v17, v14 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX900-NEXT: v_add3_u32 v17, v17, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v13, v13, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX900-NEXT: v_add_f32_e32 v13, v17, v13 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX900-NEXT: v_add3_u32 v17, v17, v13, s4 +; GFX900-NEXT: 
v_or_b32_e32 v18, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v12, v12, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX900-NEXT: v_add_f32_e32 v12, v17, v12 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX900-NEXT: v_add3_u32 v17, v17, v12, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v11, v11, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX900-NEXT: v_add_f32_e32 v11, v17, v11 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX900-NEXT: v_add3_u32 v17, v17, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v10, v10, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX900-NEXT: v_add_f32_e32 v10, v17, v10 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX900-NEXT: v_add3_u32 v17, v17, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v9, v9, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v9, v17, v9 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX900-NEXT: v_add3_u32 v17, v17, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4 +; 
GFX900-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_v16bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX950-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_add_f32_e32 v15, v17, v15 +; GFX950-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_add_f32_e32 v14, v17, v14 +; GFX950-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_add_f32_e32 v13, v17, v13 +; GFX950-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_add_f32_e32 v12, v17, v12 +; GFX950-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_add_f32_e32 v11, v17, v11 +; GFX950-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_add_f32_e32 v10, v17, v10 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v9, v17, v9 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v9 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v11 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v13 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v14 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v15 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v16 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v16bf16: ; GFX10: ; %bb.0: @@ -12112,286 +12778,407 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v32bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v31, v32, v31 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; 
GFX9-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v30, v32, v30 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v32, v32, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v33, v33, v34 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_add_f32_e32 v29, v15, v29 -; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v28, v33, v28 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v27, v33, v27 -; 
GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v26, v33, v26 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v25, v33, v25 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v24, v33, v24 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v23, v33, v23 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v22, v33, v22 -; 
GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v21, v33, v21 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v20, v33, v20 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v19, v33, v19 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v18, v33, v18 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v17, v33, v17 -; 
GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 -; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_v32bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX900-NEXT: v_add_f32_e32 v31, v32, v31 +; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v30, v30, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX900-NEXT: v_add_f32_e32 v30, v32, v30 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX900-NEXT: v_add_f32_e32 v32, v32, v29 +; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
v_lshlrev_b32_e32 v34, 16, v29 +; GFX900-NEXT: v_add_f32_e32 v33, v33, v34 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_add_f32_e32 v29, v15, v29 +; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX900-NEXT: v_add_f32_e32 v28, v33, v28 +; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX900-NEXT: v_add_f32_e32 v27, v33, v27 +; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX900-NEXT: v_add_f32_e32 v26, v33, v26 +; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX900-NEXT: v_add3_u32 v33, v33, v26, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX900-NEXT: v_add_f32_e32 v25, v33, v25 +; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1 +; 
GFX900-NEXT: v_add_f32_e32 v8, v8, v24 +; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX900-NEXT: v_add_f32_e32 v24, v33, v24 +; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX900-NEXT: v_add_f32_e32 v23, v33, v23 +; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX900-NEXT: v_add_f32_e32 v22, v33, v22 +; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX900-NEXT: v_add_f32_e32 v21, v33, v21 +; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX900-NEXT: v_add_f32_e32 v20, 
v33, v20 +; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX900-NEXT: v_add_f32_e32 v19, v33, v19 +; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX900-NEXT: v_add_f32_e32 v18, v33, v18 +; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v17, v33, v17 +; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4 +; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX900-NEXT: 
v_perm_b32 v15, v29, v15, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_v32bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7 +; GFX950-NEXT: v_add_f32_e32 v8, v8, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 +; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 +; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_add_f32_e32 v33, v34, v33 +; GFX950-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX950-NEXT: v_add_f32_e32 v30, v36, v35 +; GFX950-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX950-NEXT: v_add_f32_e32 v29, v38, v37 +; GFX950-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX950-NEXT: v_add_f32_e32 v28, v48, v39 +; GFX950-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX950-NEXT: v_add_f32_e32 v27, v50, v49 +; GFX950-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX950-NEXT: v_add_f32_e32 v26, v52, v51 +; GFX950-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX950-NEXT: v_add_f32_e32 v25, v54, v53 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v8, v25 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, v26 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, v27 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, v28 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, v29 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, v30 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v14, v33 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v31 +; GFX950-NEXT: v_add_f32_e32 v24, v32, v24 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_add_f32_e32 v23, v32, v23 +; GFX950-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: 
v_add_f32_e32 v22, v32, v22 +; GFX950-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_add_f32_e32 v21, v32, v21 +; GFX950-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_add_f32_e32 v20, v32, v20 +; GFX950-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_add_f32_e32 v19, v32, v19 +; GFX950-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX950-NEXT: v_add_f32_e32 v18, v32, v18 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v15, v15, v31 +; GFX950-NEXT: v_add_f32_e32 v31, v40, v55 +; GFX950-NEXT: v_add_f32_e32 v17, v32, v17 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v17 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v18 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v19 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v20 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v21 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v22 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v23 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v31 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v15, v24 +; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v32bf16: ; GFX10: ; %bb.0: @@ -13290,19 +14077,27 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_bf16_fpimm_0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_bf16_fpimm_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_bf16_fpimm_0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: 
v_add_f32_e32 v0, 1.0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_bf16_fpimm_0: ; GFX10: ; %bb.0: @@ -13386,19 +14181,27 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_bf16_fpimm_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v0, 0x42280000, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_bf16_fpimm_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_bf16_fpimm_1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_bf16_fpimm_1: ; GFX10: ; %bb.0: @@ -13487,20 +14290,29 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fsub_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fsub_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fsub_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fsub_bf16: ; GFX10: ; %bb.0: @@ -13615,29 +14427,41 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fsub_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fsub_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX900-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fsub_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fsub_v2bf16: ; GFX10: ; %bb.0: @@ -13800,38 +14624,54 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fsub_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX9-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 
16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fsub_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX900-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fsub_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fsub_v3bf16: ; GFX10: ; %bb.0: @@ -14041,46 +14881,65 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fsub_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: 
v_sub_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fsub_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX900-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fsub_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX950-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fsub_v4bf16: ; GFX10: ; %bb.0: @@ -14249,20 +15108,29 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_bf16: ; GFX10: ; %bb.0: @@ -14377,29 +15245,41 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_v2bf16: +; GFX950: ; %bb.0: +; 
GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v2bf16: ; GFX10: ; %bb.0: @@ -14562,38 +15442,54 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_mul_f32_e32 
v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v3bf16: ; GFX10: ; %bb.0: @@ -14803,46 +15699,65 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; 
GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX950-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v4bf16: ; GFX10: ; %bb.0: @@ -15166,80 +16081,113 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v8bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7 -; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_mul_f32_e32 v7, v9, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6 -; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v6, v9, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 -; GFX9-NEXT: 
v_or_b32_e32 v9, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v5, v9, v5 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX900-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX900-NEXT: v_add3_u32 v9, v9, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX900-NEXT: v_mul_f32_e32 v7, v9, v7 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX900-NEXT: v_add3_u32 v9, v9, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v6, v9, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX900-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v5, v9, v5 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_and_b32_e32 v0, 
0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX950-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_mul_f32_e32 v7, v9, v7 +; GFX950-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_mul_f32_e32 v6, v9, v6 +; GFX950-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v5, v9, v5 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v7 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v8bf16: ; GFX10: ; %bb.0: @@ -15855,148 +16803,209 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v16bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX9-NEXT: v_mul_f32_e32 v15, v17, v15 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14 -; 
GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GFX9-NEXT: v_mul_f32_e32 v14, v17, v14 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX9-NEXT: v_mul_f32_e32 v13, v17, v13 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX9-NEXT: v_mul_f32_e32 v12, v17, v12 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX9-NEXT: v_mul_f32_e32 v11, v17, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v10, v17, v10 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9 -; GFX9-NEXT: 
v_add3_u32 v17, v17, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v9, v17, v9 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8 -; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_v16bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX900-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_mul_f32_e32 v7, v7, v15 +; GFX900-NEXT: v_add3_u32 v17, v17, v16, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX900-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v15, v15, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX900-NEXT: v_mul_f32_e32 v15, v17, v15 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v6, v6, v14 +; GFX900-NEXT: v_add3_u32 v17, v17, v15, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX900-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v14, v14, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX900-NEXT: v_mul_f32_e32 v14, v17, v14 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v5, v5, v13 +; GFX900-NEXT: v_add3_u32 v17, v17, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 
vcc, v14, v14 +; GFX900-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v13, v13, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX900-NEXT: v_mul_f32_e32 v13, v17, v13 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v4, v4, v12 +; GFX900-NEXT: v_add3_u32 v17, v17, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v12, v12, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX900-NEXT: v_mul_f32_e32 v12, v17, v12 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v3, v3, v11 +; GFX900-NEXT: v_add3_u32 v17, v17, v12, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v11, v11, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX900-NEXT: v_mul_f32_e32 v11, v17, v11 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v2, v2, v10 +; GFX900-NEXT: v_add3_u32 v17, v17, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v10, v10, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v10, v17, v10 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX900-NEXT: v_add3_u32 v17, v17, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v9, v9, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v9, v17, v9 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 
v0, v0, v8 +; GFX900-NEXT: v_add3_u32 v17, v17, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_v16bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX950-NEXT: v_mul_f32_e32 v7, v7, v15 +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_mul_f32_e32 v15, v17, v15 +; GFX950-NEXT: v_mul_f32_e32 v6, v6, v14 +; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_mul_f32_e32 v14, v17, v14 +; GFX950-NEXT: v_mul_f32_e32 v5, v5, v13 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_mul_f32_e32 v13, v17, v13 +; GFX950-NEXT: v_mul_f32_e32 v4, v4, v12 +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_mul_f32_e32 v12, v17, v12 +; GFX950-NEXT: v_mul_f32_e32 v3, v3, v11 +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_mul_f32_e32 v11, v17, v11 +; GFX950-NEXT: v_mul_f32_e32 v2, v2, v10 +; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_mul_f32_e32 v10, v17, v10 +; GFX950-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v9, v17, v9 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v8 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v9 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v11 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v13 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v14 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v15 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v16 +; 
GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v16bf16: ; GFX10: ; %bb.0: @@ -17311,286 +18320,407 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v32bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_mul_f32_e32 v31, v32, v31 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_mul_f32_e32 v30, v32, v30 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_mul_f32_e32 v32, v32, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_mul_f32_e32 v33, v33, v34 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_mul_f32_e32 v29, v15, v29 -; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_mul_f32_e32 v28, v33, v28 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, 
v27 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_mul_f32_e32 v27, v33, v27 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: v_mul_f32_e32 v26, v33, v26 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_mul_f32_e32 v25, v33, v25 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: v_mul_f32_e32 v24, v33, v24 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: v_mul_f32_e32 v23, v33, v23 -; GFX9-NEXT: v_and_b32_e32 v22, 
0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: v_mul_f32_e32 v22, v33, v22 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: v_mul_f32_e32 v21, v33, v21 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: v_mul_f32_e32 v20, v33, v20 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_mul_f32_e32 v19, v33, v19 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v18, v33, v18 -; GFX9-NEXT: v_and_b32_e32 v17, 
0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v17, v33, v17 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 -; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_v32bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX900-NEXT: v_mul_f32_e32 v31, v32, v31 +; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v30, v30, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX900-NEXT: v_mul_f32_e32 v30, v32, v30 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13 +; 
GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX900-NEXT: v_mul_f32_e32 v32, v32, v29 +; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX900-NEXT: v_mul_f32_e32 v33, v33, v34 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_mul_f32_e32 v29, v15, v29 +; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX900-NEXT: v_mul_f32_e32 v28, v33, v28 +; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX900-NEXT: v_mul_f32_e32 v27, v33, v27 +; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX900-NEXT: v_mul_f32_e32 v26, v33, v26 +; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v9, v9, v25 +; GFX900-NEXT: v_add3_u32 v33, v33, 
v26, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX900-NEXT: v_mul_f32_e32 v25, v33, v25 +; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v8, v8, v24 +; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX900-NEXT: v_mul_f32_e32 v24, v33, v24 +; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX900-NEXT: v_mul_f32_e32 v23, v33, v23 +; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX900-NEXT: v_mul_f32_e32 v22, v33, v22 +; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX900-NEXT: v_mul_f32_e32 v21, v33, v21 +; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX900-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v4, v4, v20 +; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX900-NEXT: v_mul_f32_e32 v20, v33, v20 +; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v3, v3, v19 +; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX900-NEXT: v_mul_f32_e32 v19, v33, v19 +; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v18, v33, v18 +; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v17 +; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v17, v33, v17 +; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX900-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4 +; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX900-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_v32bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7 +; GFX950-NEXT: v_mul_f32_e32 v8, v8, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 +; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 +; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_mul_f32_e32 v33, v34, v33 +; GFX950-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX950-NEXT: v_mul_f32_e32 v30, v36, v35 +; GFX950-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX950-NEXT: v_mul_f32_e32 v29, v38, v37 +; GFX950-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX950-NEXT: v_mul_f32_e32 v28, v48, v39 +; GFX950-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX950-NEXT: v_mul_f32_e32 v27, v50, v49 +; GFX950-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX950-NEXT: v_mul_f32_e32 v26, v52, v51 +; GFX950-NEXT: v_mul_f32_e32 v9, v9, v25 +; GFX950-NEXT: v_mul_f32_e32 v25, v54, v53 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v8, v25 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, v26 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, v27 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, v28 +; 
GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, v29 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, v30 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v14, v33 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v31 +; GFX950-NEXT: v_mul_f32_e32 v24, v32, v24 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_mul_f32_e32 v23, v32, v23 +; GFX950-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_mul_f32_e32 v22, v32, v22 +; GFX950-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_mul_f32_e32 v21, v32, v21 +; GFX950-NEXT: v_mul_f32_e32 v4, v4, v20 +; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_mul_f32_e32 v20, v32, v20 +; GFX950-NEXT: v_mul_f32_e32 v3, v3, v19 +; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_mul_f32_e32 v19, v32, v19 +; GFX950-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX950-NEXT: v_mul_f32_e32 v18, v32, v18 +; GFX950-NEXT: v_mul_f32_e32 v1, v1, v17 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v15, v15, v31 +; GFX950-NEXT: v_mul_f32_e32 v31, v40, v55 +; GFX950-NEXT: v_mul_f32_e32 v17, v32, v17 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v17 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v18 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v19 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v20 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v21 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v22 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v23 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v31 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v15, v24 +; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v32bf16: ; GFX10: ; %bb.0: @@ -18524,30 +19654,50 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fdiv_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_rcp_f32_e32 v4, v2 -; GFX9-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX9-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX9-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX9-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX9-NEXT: 
v_fma_f32 v2, -v2, v5, v3 -; GFX9-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX9-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fdiv_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX900-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_rcp_f32_e32 v4, v2 +; GFX900-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX900-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX900-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX900-NEXT: v_fma_f32 v6, -v2, v5, v3 +; GFX900-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX900-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX900-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX900-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fdiv_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 +; GFX950-NEXT: v_rcp_f32_e32 v3, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX950-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX950-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX950-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX950-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX950-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX950-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX950-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_bf16: ; GFX10: ; %bb.0: @@ -18996,20 +20146,29 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX950-LABEL: v_minnum_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_bf16: ; GFX10: ; %bb.0: @@ -19124,29 +20283,41 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minnum_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v2bf16: ; GFX10: ; %bb.0: @@ -19309,38 +20480,54 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 
0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minnum_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v3bf16: ; GFX10: ; %bb.0: @@ -19550,46 +20737,65 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX900-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minnum_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX950-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 
16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v4bf16: ; GFX10: ; %bb.0: @@ -19913,80 +21119,113 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v8bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_min_f32_e32 v7, v9, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; GFX9-NEXT: v_min_f32_e32 v6, v9, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v5, v9, v5 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 -; GFX9-NEXT: v_perm_b32 v3, 
v3, v8, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX900-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX900-NEXT: v_add3_u32 v9, v9, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX900-NEXT: v_min_f32_e32 v7, v9, v7 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX900-NEXT: v_add3_u32 v9, v9, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX900-NEXT: v_min_f32_e32 v6, v9, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX900-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v5, v9, v5 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minnum_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 
16, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX950-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_min_f32_e32 v7, v9, v7 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_min_f32_e32 v6, v9, v6 +; GFX950-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v5, v9, v5 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v7 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v8bf16: ; GFX10: ; %bb.0: @@ -20602,148 +21841,209 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v16bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX9-NEXT: v_min_f32_e32 v15, v17, v15 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GFX9-NEXT: v_min_f32_e32 v14, v17, v14 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 -; GFX9-NEXT: 
v_or_b32_e32 v17, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX9-NEXT: v_min_f32_e32 v13, v17, v13 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX9-NEXT: v_min_f32_e32 v12, v17, v12 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX9-NEXT: v_min_f32_e32 v11, v17, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; GFX9-NEXT: v_min_f32_e32 v10, v17, v10 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v9, v17, v9 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v8 -; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 -; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_v16bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX900-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX900-NEXT: v_add3_u32 v17, v17, v16, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX900-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v15, v15, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX900-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX900-NEXT: v_add3_u32 v17, v17, v15, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX900-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v14, v14, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX900-NEXT: v_min_f32_e32 v14, v17, v14 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX900-NEXT: v_add3_u32 v17, v17, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v13, v13, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX900-NEXT: v_min_f32_e32 v13, v17, v13 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX900-NEXT: v_add3_u32 v17, v17, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v12, v12, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; 
GFX900-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX900-NEXT: v_min_f32_e32 v12, v17, v12 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX900-NEXT: v_add3_u32 v17, v17, v12, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v11, v11, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX900-NEXT: v_min_f32_e32 v11, v17, v11 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX900-NEXT: v_add3_u32 v17, v17, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v10, v10, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX900-NEXT: v_min_f32_e32 v10, v17, v10 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX900-NEXT: v_add3_u32 v17, v17, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v9, v9, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v9, v17, v9 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX900-NEXT: v_add3_u32 v17, v17, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minnum_v16bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 
v16, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX950-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX950-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_min_f32_e32 v14, v17, v14 +; GFX950-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_min_f32_e32 v13, v17, v13 +; GFX950-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_min_f32_e32 v12, v17, v12 +; GFX950-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_min_f32_e32 v11, v17, v11 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_min_f32_e32 v10, v17, v10 +; GFX950-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v9, v17, v9 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v9 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v11 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v13 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v14 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v15 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v16 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v16bf16: ; GFX10: ; %bb.0: @@ -22058,286 +23358,407 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v32bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_min_f32_e32 v31, v32, v31 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_min_f32_e32 v30, v32, v30 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_min_f32_e32 v32, v32, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_min_f32_e32 v33, v33, v34 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_min_f32_e32 v29, v15, v29 -; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_min_f32_e32 v28, v33, v28 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_min_f32_e32 v27, v33, v27 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 
-; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: v_min_f32_e32 v26, v33, v26 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_min_f32_e32 v25, v33, v25 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: v_min_f32_e32 v24, v33, v24 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: v_min_f32_e32 v23, v33, v23 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: v_min_f32_e32 v22, v33, v22 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; 
GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: v_min_f32_e32 v21, v33, v21 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: v_min_f32_e32 v20, v33, v20 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_min_f32_e32 v19, v33, v19 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX9-NEXT: v_min_f32_e32 v18, v33, v18 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v17, v33, v17 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; 
GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 -; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_v32bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX900-NEXT: v_min_f32_e32 v31, v32, v31 +; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v30, v30, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX900-NEXT: v_min_f32_e32 v30, v32, v30 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX900-NEXT: v_min_f32_e32 v32, v32, v29 +; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX900-NEXT: v_min_f32_e32 v33, v33, v34 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_min_f32_e32 v29, v15, v29 +; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; 
GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX900-NEXT: v_min_f32_e32 v28, v33, v28 +; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX900-NEXT: v_min_f32_e32 v27, v33, v27 +; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX900-NEXT: v_min_f32_e32 v26, v33, v26 +; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX900-NEXT: v_add3_u32 v33, v33, v26, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX900-NEXT: v_min_f32_e32 v25, v33, v25 +; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4 
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX900-NEXT: v_min_f32_e32 v24, v33, v24 +; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX900-NEXT: v_min_f32_e32 v23, v33, v23 +; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX900-NEXT: v_min_f32_e32 v22, v33, v22 +; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX900-NEXT: v_min_f32_e32 v21, v33, v21 +; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX900-NEXT: v_min_f32_e32 v20, v33, v20 +; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX900-NEXT: v_cmp_u_f32_e32 
vcc, v20, v20 +; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX900-NEXT: v_min_f32_e32 v19, v33, v19 +; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX900-NEXT: v_min_f32_e32 v18, v33, v18 +; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v17, v33, v17 +; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4 +; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX900-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minnum_v32bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; GFX950-NEXT: v_and_b32_e32 v54, 
0xffff0000, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7 +; GFX950-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 +; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 +; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_min_f32_e32 v33, v34, v33 +; GFX950-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX950-NEXT: v_min_f32_e32 v30, v36, v35 +; GFX950-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX950-NEXT: v_min_f32_e32 v29, v38, v37 +; GFX950-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX950-NEXT: v_min_f32_e32 v28, v48, v39 +; GFX950-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX950-NEXT: v_min_f32_e32 v27, v50, v49 +; GFX950-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX950-NEXT: v_min_f32_e32 v26, v52, v51 +; GFX950-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX950-NEXT: v_min_f32_e32 v25, v54, v53 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v8, v25 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, v26 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, v27 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, v28 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, v29 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, v30 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v14, v33 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v31 +; GFX950-NEXT: v_min_f32_e32 v24, v32, v24 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_min_f32_e32 v23, v32, v23 +; GFX950-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_min_f32_e32 v22, v32, v22 +; GFX950-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_min_f32_e32 v21, v32, v21 +; 
GFX950-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_min_f32_e32 v20, v32, v20 +; GFX950-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_min_f32_e32 v19, v32, v19 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX950-NEXT: v_min_f32_e32 v18, v32, v18 +; GFX950-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v15, v15, v31 +; GFX950-NEXT: v_min_f32_e32 v31, v40, v55 +; GFX950-NEXT: v_min_f32_e32 v17, v32, v17 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v17 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v18 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v19 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v20 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v21 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v22 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v23 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v31 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v15, v24 +; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v32bf16: ; GFX10: ; %bb.0: @@ -23250,20 +24671,29 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maxnum_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maxnum_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_bf16: ; GFX10: ; %bb.0: @@ -23378,29 +24808,41 @@ define <2 
x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maxnum_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maxnum_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v2bf16: ; GFX10: ; %bb.0: @@ -23563,38 +25005,54 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v4, 
v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maxnum_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maxnum_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v3bf16: ; GFX10: ; %bb.0: @@ -23804,46 +25262,65 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maxnum_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX900-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maxnum_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX950-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v4bf16: ; GFX10: ; %bb.0: @@ -24167,80 +25644,113 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: 
v_alignbit_b32 v3, v3, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_v8bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_max_f32_e32 v7, v9, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; GFX9-NEXT: v_max_f32_e32 v6, v9, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v5, v9, v5 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maxnum_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX900-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v9, v8, 16, 1 
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX900-NEXT: v_add3_u32 v9, v9, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX900-NEXT: v_max_f32_e32 v7, v9, v7 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX900-NEXT: v_add3_u32 v9, v9, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX900-NEXT: v_max_f32_e32 v6, v9, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX900-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v5, v9, v5 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maxnum_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX950-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_max_f32_e32 v7, v9, v7 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX950-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v5 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_max_f32_e32 v6, v9, v6 +; GFX950-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_max_f32_e32 v5, v9, v5 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v7 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v8bf16: ; GFX10: ; %bb.0: @@ -24856,148 +26366,209 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_v16bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_max_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX9-NEXT: v_max_f32_e32 v15, v17, v15 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GFX9-NEXT: v_max_f32_e32 v14, v17, v14 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX9-NEXT: v_max_f32_e32 v13, v17, v13 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v12 
-; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX9-NEXT: v_max_f32_e32 v12, v17, v12 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX9-NEXT: v_max_f32_e32 v11, v17, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; GFX9-NEXT: v_max_f32_e32 v10, v17, v10 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v9, v17, v9 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v8 -; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 -; GFX9-NEXT: 
v_perm_b32 v7, v7, v16, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maxnum_v16bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX900-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX900-NEXT: v_add3_u32 v17, v17, v16, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX900-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v15, v15, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX900-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX900-NEXT: v_add3_u32 v17, v17, v15, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX900-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v14, v14, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX900-NEXT: v_max_f32_e32 v14, v17, v14 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX900-NEXT: v_add3_u32 v17, v17, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v13, v13, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX900-NEXT: v_max_f32_e32 v13, v17, v13 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX900-NEXT: v_add3_u32 v17, v17, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v12, v12, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX900-NEXT: v_max_f32_e32 v12, v17, v12 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX900-NEXT: v_add3_u32 v17, v17, v12, s4 +; 
GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v11, v11, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX900-NEXT: v_max_f32_e32 v11, v17, v11 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX900-NEXT: v_add3_u32 v17, v17, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v10, v10, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX900-NEXT: v_max_f32_e32 v10, v17, v10 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX900-NEXT: v_add3_u32 v17, v17, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v9, v9, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v9, v17, v9 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX900-NEXT: v_add3_u32 v17, v17, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maxnum_v16bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX950-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: 
v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX950-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_max_f32_e32 v14, v17, v14 +; GFX950-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_max_f32_e32 v13, v17, v13 +; GFX950-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_max_f32_e32 v12, v17, v12 +; GFX950-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_max_f32_e32 v11, v17, v11 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_max_f32_e32 v10, v17, v10 +; GFX950-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_max_f32_e32 v9, v17, v9 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v9 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v11 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v13 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v14 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v15 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v16 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v16bf16: ; GFX10: ; %bb.0: @@ -26312,286 +27883,407 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_v32bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_max_f32_e32 v31, v32, v31 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_max_f32_e32 v30, v32, v30 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 
-; GFX9-NEXT: v_max_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_max_f32_e32 v32, v32, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_max_f32_e32 v33, v33, v34 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_max_f32_e32 v29, v15, v29 -; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_max_f32_e32 v28, v33, v28 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_max_f32_e32 v27, v33, v27 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: v_max_f32_e32 v26, v33, v26 -; 
GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_max_f32_e32 v25, v33, v25 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: v_max_f32_e32 v24, v33, v24 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: v_max_f32_e32 v23, v33, v23 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: v_max_f32_e32 v22, v33, v22 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: v_max_f32_e32 v21, v33, v21 -; 
GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: v_max_f32_e32 v20, v33, v20 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_max_f32_e32 v19, v33, v19 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX9-NEXT: v_max_f32_e32 v18, v33, v18 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v17, v33, v17 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX9-NEXT: 
v_perm_b32 v2, v2, v19, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 -; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maxnum_v32bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX900-NEXT: v_max_f32_e32 v31, v32, v31 +; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v30, v30, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX900-NEXT: v_max_f32_e32 v30, v32, v30 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX900-NEXT: v_max_f32_e32 v32, v32, v29 +; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX900-NEXT: v_max_f32_e32 v33, v33, v34 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_max_f32_e32 v29, v15, v29 +; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, 
v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX900-NEXT: v_max_f32_e32 v28, v33, v28 +; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX900-NEXT: v_max_f32_e32 v27, v33, v27 +; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX900-NEXT: v_max_f32_e32 v26, v33, v26 +; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX900-NEXT: v_add3_u32 v33, v33, v26, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX900-NEXT: v_max_f32_e32 v25, v33, v25 +; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX900-NEXT: v_max_f32_e32 v24, v33, v24 +; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 
v33, v24, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX900-NEXT: v_max_f32_e32 v23, v33, v23 +; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX900-NEXT: v_max_f32_e32 v22, v33, v22 +; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX900-NEXT: v_max_f32_e32 v21, v33, v21 +; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX900-NEXT: v_max_f32_e32 v20, v33, v20 +; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX900-NEXT: 
v_max_f32_e32 v19, v33, v19 +; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX900-NEXT: v_max_f32_e32 v18, v33, v18 +; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v17, v33, v17 +; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4 +; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX900-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maxnum_v32bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7 +; GFX950-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; 
GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 +; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 +; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_max_f32_e32 v33, v34, v33 +; GFX950-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX950-NEXT: v_max_f32_e32 v30, v36, v35 +; GFX950-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX950-NEXT: v_max_f32_e32 v29, v38, v37 +; GFX950-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX950-NEXT: v_max_f32_e32 v28, v48, v39 +; GFX950-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX950-NEXT: v_max_f32_e32 v27, v50, v49 +; GFX950-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX950-NEXT: v_max_f32_e32 v26, v52, v51 +; GFX950-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX950-NEXT: v_max_f32_e32 v25, v54, v53 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v8, v25 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, v26 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, v27 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, v28 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, v29 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, v30 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v14, v33 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v31 +; GFX950-NEXT: v_max_f32_e32 v24, v32, v24 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_max_f32_e32 v23, v32, v23 +; GFX950-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_max_f32_e32 v22, v32, v22 +; GFX950-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_max_f32_e32 v21, v32, v21 +; GFX950-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_max_f32_e32 v20, v32, v20 +; GFX950-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_max_f32_e32 v19, v32, v19 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX950-NEXT: v_max_f32_e32 v18, v32, v18 +; GFX950-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_max_f32_e32 v15, v15, v31 +; GFX950-NEXT: v_max_f32_e32 v31, v40, v55 +; GFX950-NEXT: v_max_f32_e32 v17, v32, v17 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v17 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v18 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v19 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v20 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v21 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v22 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v23 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v31 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v15, v24 +; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v32bf16: ; GFX10: ; %bb.0: @@ -27543,36 +29235,66 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sqrt_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_mov_b32 s4, 0xf800000 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_sqrt_f32_e32 v1, v0 -; GFX9-NEXT: v_add_u32_e32 v2, -1, v1 -; GFX9-NEXT: v_fma_f32 v3, -v2, v1, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] -; GFX9-NEXT: v_fma_f32 v1, -v3, v1, v0 -; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] -; GFX9-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0x260 -; GFX9-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sqrt_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0xf800000 +; GFX900-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: v_sqrt_f32_e32 v1, v0 +; GFX900-NEXT: v_add_u32_e32 v2, -1, v1 +; GFX900-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GFX900-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GFX900-NEXT: v_add_u32_e32 v3, 1, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; GFX900-NEXT: v_fma_f32 v1, -v3, v1, v0 +; GFX900-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 
v1, v2, v3, s[4:5] +; GFX900-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX900-NEXT: v_mov_b32_e32 v2, 0x260 +; GFX900-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sqrt_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: s_mov_b32 s0, 0xf800000 +; GFX950-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX950-NEXT: v_sqrt_f32_e32 v1, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_add_u32_e32 v2, -1, v1 +; GFX950-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GFX950-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v3 +; GFX950-NEXT: v_add_u32_e32 v3, 1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] +; GFX950-NEXT: v_fma_f32 v1, -v3, v1, v0 +; GFX950-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] +; GFX950-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX950-NEXT: v_mov_b32_e32 v2, 0x260 +; GFX950-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sqrt_bf16: ; GFX10: ; %bb.0: @@ -27715,19 +29437,27 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_ldexp_bf16_i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_ldexp_bf16_i32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_ldexp_bf16_i32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ldexp_bf16_i32: ; GFX10: ; %bb.0: @@ -27820,20 +29550,29 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: 
v_frexp_bf16_i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_frexp_mant_f32_e32 v0, v1 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_frexp_bf16_i16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX900-NEXT: v_frexp_mant_f32_e32 v0, v1 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_frexp_bf16_i16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX950-NEXT: v_frexp_mant_f32_e32 v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_frexp_bf16_i16: ; GFX10: ; %bb.0: @@ -27979,35 +29718,61 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_log_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x800000 -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc -; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_log_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x3f317217 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX9-NEXT: v_fma_f32 v2, v0, s4, -v1 -; GFX9-NEXT: s_mov_b32 s4, 0x3377d1cf -; GFX9-NEXT: v_fma_f32 v2, v0, s4, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x7f800000 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_log_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-NEXT: v_log_f32_e32 v0, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x3f317217 +; GFX900-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 +; GFX900-NEXT: v_fma_f32 v2, v0, s4, -v1 +; GFX900-NEXT: s_mov_b32 s4, 0x3377d1cf +; GFX900-NEXT: v_fma_f32 v2, v0, s4, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX900-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; 
GFX900-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_log_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x800000 +; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x3f317217 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GFX950-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX950-NEXT: v_log_f32_e32 v0, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 +; GFX950-NEXT: v_fma_f32 v2, v0, s0, -v1 +; GFX950-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX950-NEXT: s_mov_b32 s0, 0x7f800000 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX950-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_log_bf16: ; GFX10: ; %bb.0: @@ -28153,26 +29918,42 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_log2_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x800000 -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_log_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_log2_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_log_f32_e32 v0, v0 +; GFX900-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_log2_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: s_mov_b32 s0, 
0x800000 +; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX950-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_log_f32_e32 v0, v0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_log2_bf16: ; GFX10: ; %bb.0: @@ -28329,35 +30110,61 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_log10_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x800000 -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc -; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_log_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x3e9a209a -; GFX9-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX9-NEXT: v_fma_f32 v2, v0, s4, -v1 -; GFX9-NEXT: s_mov_b32 s4, 0x3284fbcf -; GFX9-NEXT: v_fma_f32 v2, v0, s4, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x7f800000 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x411a209b -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_log10_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-NEXT: v_log_f32_e32 v0, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x3e9a209a +; GFX900-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 +; GFX900-NEXT: v_fma_f32 v2, v0, s4, -v1 +; GFX900-NEXT: s_mov_b32 s4, 0x3284fbcf +; GFX900-NEXT: v_fma_f32 v2, v0, s4, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX900-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x411a209b +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_log10_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x800000 +; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x3e9a209a +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GFX950-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX950-NEXT: v_log_f32_e32 v0, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 +; GFX950-NEXT: 
v_fma_f32 v2, v0, s0, -v1 +; GFX950-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX950-NEXT: s_mov_b32 s0, 0x7f800000 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX950-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x411a209b +; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_log10_bf16: ; GFX10: ; %bb.0: @@ -28541,36 +30348,61 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_exp_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x3fb8aa3b -; GFX9-NEXT: v_rndne_f32_e32 v2, v1 -; GFX9-NEXT: v_sub_f32_e32 v3, v1, v2 -; GFX9-NEXT: v_fma_f32 v1, v0, s4, -v1 -; GFX9-NEXT: s_mov_b32 s4, 0x32a5705f -; GFX9-NEXT: v_fma_f32 v1, v0, s4, v1 -; GFX9-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX9-NEXT: v_exp_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: s_mov_b32 s4, 0xc2ce8ed0 -; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x42b17218 -; GFX9-NEXT: v_ldexp_f32 v1, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_exp_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x3fb8aa3b +; GFX900-NEXT: v_rndne_f32_e32 v2, v1 +; GFX900-NEXT: v_sub_f32_e32 v3, v1, v2 +; GFX900-NEXT: v_fma_f32 v1, v0, s4, -v1 +; GFX900-NEXT: s_mov_b32 s4, 0x32a5705f +; GFX900-NEXT: v_fma_f32 v1, v0, s4, v1 +; GFX900-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX900-NEXT: v_exp_f32_e32 v1, v1 +; GFX900-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX900-NEXT: s_mov_b32 s4, 0xc2ce8ed0 +; GFX900-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x42b17218 +; GFX900-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX900-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_exp_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x3fb8aa3b +; GFX950-NEXT: v_rndne_f32_e32 v2, v1 +; GFX950-NEXT: v_sub_f32_e32 v3, v1, v2 +; GFX950-NEXT: v_fma_f32 v1, v0, s0, -v1 +; GFX950-NEXT: v_fmamk_f32 
v1, v0, 0x32a5705f, v1 +; GFX950-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX950-NEXT: v_exp_f32_e32 v1, v1 +; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX950-NEXT: s_mov_b32 s0, 0xc2ce8ed0 +; GFX950-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x42b17218 +; GFX950-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX950-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_exp_bf16: ; GFX10: ; %bb.0: @@ -28722,27 +30554,43 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_exp2_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_not_b32_e32 v1, 63 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_exp2_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-NEXT: v_not_b32_e32 v1, 63 +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_exp2_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: s_mov_b32 s0, 0xc2fc0000 +; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX950-NEXT: v_not_b32_e32 v1, 63 +; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX950-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_exp_f32_e32 v0, v0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX950-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_exp2_bf16: ; GFX10: ; %bb.0: @@ -28900,36 +30748,61 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_exp10_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x40549a78 -; 
GFX9-NEXT: v_rndne_f32_e32 v2, v1 -; GFX9-NEXT: v_sub_f32_e32 v3, v1, v2 -; GFX9-NEXT: v_fma_f32 v1, v0, s4, -v1 -; GFX9-NEXT: s_mov_b32 s4, 0x33979a37 -; GFX9-NEXT: v_fma_f32 v1, v0, s4, v1 -; GFX9-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX9-NEXT: v_exp_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: s_mov_b32 s4, 0xc23369f4 -; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x421a209b -; GFX9-NEXT: v_ldexp_f32 v1, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_exp10_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x40549a78 +; GFX900-NEXT: v_rndne_f32_e32 v2, v1 +; GFX900-NEXT: v_sub_f32_e32 v3, v1, v2 +; GFX900-NEXT: v_fma_f32 v1, v0, s4, -v1 +; GFX900-NEXT: s_mov_b32 s4, 0x33979a37 +; GFX900-NEXT: v_fma_f32 v1, v0, s4, v1 +; GFX900-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX900-NEXT: v_exp_f32_e32 v1, v1 +; GFX900-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX900-NEXT: s_mov_b32 s4, 0xc23369f4 +; GFX900-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x421a209b +; GFX900-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX900-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_exp10_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x40549a78 +; GFX950-NEXT: v_rndne_f32_e32 v2, v1 +; GFX950-NEXT: v_sub_f32_e32 v3, v1, v2 +; GFX950-NEXT: v_fma_f32 v1, v0, s0, -v1 +; GFX950-NEXT: v_fmamk_f32 v1, v0, 0x33979a37, v1 +; GFX950-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX950-NEXT: v_exp_f32_e32 v1, v1 +; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX950-NEXT: s_mov_b32 s0, 0xc23369f4 +; GFX950-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x421a209b +; GFX950-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX950-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_exp10_bf16: ; GFX10: ; %bb.0: @@ -29059,19 +30932,27 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_ceil_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_ceil_f32_e32 v0, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_ceil_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_ceil_f32_e32 v0, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_ceil_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_ceil_f32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ceil_bf16: ; GFX10: ; %bb.0: @@ -29157,19 +31038,27 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_trunc_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_trunc_f32_e32 v0, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_trunc_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_trunc_f32_e32 v0, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_trunc_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_trunc_f32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_trunc_bf16: ; GFX10: ; %bb.0: @@ -29255,19 +31144,27 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_rint_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_rndne_f32_e32 v0, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_rint_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 
+; GFX900-NEXT: v_rndne_f32_e32 v0, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_rint_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_rndne_f32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rint_bf16: ; GFX10: ; %bb.0: @@ -29353,19 +31250,27 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_nearbyint_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_rndne_f32_e32 v0, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_nearbyint_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_rndne_f32_e32 v0, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_nearbyint_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_rndne_f32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_nearbyint_bf16: ; GFX10: ; %bb.0: @@ -29469,25 +31374,40 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_round_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_trunc_f32_e32 v1, v0 -; GFX9-NEXT: v_sub_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] -; GFX9-NEXT: s_brev_b32 s4, -2 -; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_round_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_trunc_f32_e32 v1, v0 +; GFX900-NEXT: v_sub_f32_e32 v2, v0, v1 +; GFX900-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 +; GFX900-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] +; GFX900-NEXT: s_brev_b32 s4, -2 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; 
GFX900-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_round_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_trunc_f32_e32 v1, v0 +; GFX950-NEXT: v_sub_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1] +; GFX950-NEXT: s_brev_b32 s0, -2 +; GFX950-NEXT: v_bfi_b32 v0, s0, v2, v0 +; GFX950-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_round_bf16: ; GFX10: ; %bb.0: @@ -29592,19 +31512,27 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_roundeven_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_rndne_f32_e32 v0, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_roundeven_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_rndne_f32_e32 v0, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_roundeven_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_rndne_f32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_roundeven_bf16: ; GFX10: ; %bb.0: @@ -29690,19 +31618,27 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_floor_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_floor_f32_e32 v0, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_floor_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_floor_f32_e32 v0, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: 
v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_floor_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_floor_f32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_floor_bf16: ; GFX10: ; %bb.0: @@ -29786,19 +31722,27 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_canonicalize_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_canonicalize_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_canonicalize_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_canonicalize_bf16: ; GFX10: ; %bb.0: @@ -29929,14 +31873,24 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_oeq_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_oeq_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_oeq_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_oeq_bf16: ; GFX10: ; %bb.0: @@ -30004,14 +31958,24 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_ogt_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; 
GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_ogt_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_ogt_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_ogt_bf16: ; GFX10: ; %bb.0: @@ -30079,14 +32043,24 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_oge_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_oge_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_oge_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_oge_bf16: ; GFX10: ; %bb.0: @@ -30154,14 +32128,24 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_olt_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_olt_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_olt_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_olt_bf16: ; GFX10: ; %bb.0: @@ -30229,14 +32213,24 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_ole_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_ole_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_ole_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_ole_bf16: ; GFX10: ; %bb.0: @@ -30304,14 +32298,24 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_one_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_one_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_one_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_one_bf16: ; GFX10: ; %bb.0: @@ -30379,14 +32383,24 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_uno_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_uno_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_uno_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_uno_bf16: ; GFX10: ; %bb.0: @@ -30454,14 +32468,24 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 
s[30:31] ; -; GFX9-LABEL: v_fcmp_ueq_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_ueq_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_ueq_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_ueq_bf16: ; GFX10: ; %bb.0: @@ -30529,14 +32553,24 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_ugt_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_ugt_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_ugt_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_ugt_bf16: ; GFX10: ; %bb.0: @@ -30604,14 +32638,24 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_uge_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_uge_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_uge_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_uge_bf16: ; GFX10: ; %bb.0: @@ -30679,14 +32723,24 @@ define i1 @v_fcmp_ult_bf16(bfloat 
%a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_ult_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_ult_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_ult_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_ult_bf16: ; GFX10: ; %bb.0: @@ -30754,14 +32808,24 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_ule_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_ule_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_ule_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_ule_bf16: ; GFX10: ; %bb.0: @@ -30829,14 +32893,24 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_une_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_une_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_une_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
v_fcmp_une_bf16: ; GFX10: ; %bb.0: @@ -31011,16 +33085,27 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_v2bf16_to_v2i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_v2bf16_to_v2i16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX900-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fptosi_v2bf16_to_v2i16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX950-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX950-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_perm_b32 v0, v0, v1, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v2bf16_to_v2i16: ; GFX10: ; %bb.0: @@ -31110,18 +33195,31 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) { ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_v3bf16_to_v3i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_v3bf16_to_v3i16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX900-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fptosi_v3bf16_to_v3i16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX950-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_perm_b32 v0, v0, v2, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v3bf16_to_v3i16: ; GFX10: ; %bb.0: @@ -31232,21 +33330,37 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) { ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX9-LABEL: v_fptosi_v4bf16_to_v4i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_v4bf16_to_v4i16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX900-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX900-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX900-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fptosi_v4bf16_to_v4i16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX950-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX950-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX950-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX950-NEXT: v_perm_b32 v1, v1, v2, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v4bf16_to_v4i16: ; GFX10: ; %bb.0: @@ -31663,24 +33777,44 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) { ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_bf16_to_i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_trunc_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x2f800000 -; GFX9-NEXT: v_mul_f32_e64 v1, |v0|, s4 -; GFX9-NEXT: v_floor_f32_e32 v1, v1 -; GFX9-NEXT: s_mov_b32 s4, 0xcf800000 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1 -; GFX9-NEXT: v_fma_f32 v1, v1, s4, |v0| -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; GFX9-NEXT: v_xor_b32_e32 v2, v2, v3 -; GFX9-NEXT: v_xor_b32_e32 v0, v1, v3 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_bf16_to_i64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_trunc_f32_e32 v0, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x2f800000 +; GFX900-NEXT: v_mul_f32_e64 v1, |v0|, s4 +; GFX900-NEXT: v_floor_f32_e32 v1, v1 +; GFX900-NEXT: s_mov_b32 s4, 0xcf800000 +; GFX900-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX900-NEXT: v_fma_f32 v1, v1, s4, |v0| +; GFX900-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX900-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX900-NEXT: v_xor_b32_e32 v2, v2, v3 +; GFX900-NEXT: v_xor_b32_e32 v0, v1, v3 +; GFX900-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 +; GFX900-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v3, vcc +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX950-LABEL: v_fptosi_bf16_to_i64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_trunc_f32_e32 v0, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x2f800000 +; GFX950-NEXT: v_mul_f32_e64 v1, |v0|, s0 +; GFX950-NEXT: v_floor_f32_e32 v1, v1 +; GFX950-NEXT: s_mov_b32 s0, 0xcf800000 +; GFX950-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX950-NEXT: v_fma_f32 v1, v1, s0, |v0| +; GFX950-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX950-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX950-NEXT: v_xor_b32_e32 v2, v2, v3 +; GFX950-NEXT: v_xor_b32_e32 v0, v1, v3 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v3, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_bf16_to_i64: ; GFX10: ; %bb.0: @@ -31845,36 +33979,69 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) { ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_v2bf16_to_v2i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x2f800000 -; GFX9-NEXT: v_mul_f32_e64 v2, |v1|, s4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_floor_f32_e32 v2, v2 -; GFX9-NEXT: s_mov_b32 s5, 0xcf800000 -; GFX9-NEXT: v_trunc_f32_e32 v4, v0 -; GFX9-NEXT: v_fma_f32 v3, v2, s5, |v1| -; GFX9-NEXT: v_mul_f32_e64 v0, |v4|, s4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_floor_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_fma_f32 v5, v0, s5, |v4| -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX9-NEXT: v_xor_b32_e32 v3, v3, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, v2, v1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, v5, v3 -; GFX9-NEXT: v_xor_b32_e32 v4, v6, v3 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_v2bf16_to_v2i64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX900-NEXT: v_trunc_f32_e32 v1, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x2f800000 +; GFX900-NEXT: v_mul_f32_e64 v2, |v1|, s4 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_floor_f32_e32 v2, v2 +; GFX900-NEXT: s_mov_b32 s5, 0xcf800000 +; GFX900-NEXT: v_trunc_f32_e32 v4, v0 +; GFX900-NEXT: v_fma_f32 v3, v2, s5, |v1| +; GFX900-NEXT: v_mul_f32_e64 v0, |v4|, s4 +; GFX900-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX900-NEXT: v_floor_f32_e32 v0, v0 +; GFX900-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX900-NEXT: v_fma_f32 v5, v0, s5, |v4| +; GFX900-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX900-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX900-NEXT: v_xor_b32_e32 v3, v3, v1 +; GFX900-NEXT: v_xor_b32_e32 v2, v2, v1 +; GFX900-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v1 +; GFX900-NEXT: v_ashrrev_i32_e32 v3, 31, v4 +; GFX900-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX900-NEXT: v_xor_b32_e32 v2, v5, v3 +; GFX900-NEXT: v_xor_b32_e32 v4, v6, v3 +; GFX900-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX900-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: 
v_fptosi_v2bf16_to_v2i64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX950-NEXT: v_trunc_f32_e32 v1, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x2f800000 +; GFX950-NEXT: v_mul_f32_e64 v2, |v1|, s0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX950-NEXT: v_floor_f32_e32 v2, v2 +; GFX950-NEXT: s_mov_b32 s1, 0xcf800000 +; GFX950-NEXT: v_trunc_f32_e32 v4, v0 +; GFX950-NEXT: v_fma_f32 v3, v2, s1, |v1| +; GFX950-NEXT: v_mul_f32_e64 v0, |v4|, s0 +; GFX950-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX950-NEXT: v_floor_f32_e32 v0, v0 +; GFX950-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX950-NEXT: v_fma_f32 v5, v0, s1, |v4| +; GFX950-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX950-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX950-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX950-NEXT: v_xor_b32_e32 v3, v3, v1 +; GFX950-NEXT: v_xor_b32_e32 v2, v2, v1 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v1 +; GFX950-NEXT: v_ashrrev_i32_e32 v3, 31, v4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX950-NEXT: v_xor_b32_e32 v2, v5, v3 +; GFX950-NEXT: v_xor_b32_e32 v4, v6, v3 +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v2bf16_to_v2i64: ; GFX10: ; %bb.0: @@ -32082,49 +34249,96 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) { ; GFX8-NEXT: v_mov_b32_e32 v1, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_v3bf16_to_v3i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x2f800000 -; GFX9-NEXT: v_mul_f32_e64 v3, |v2|, s4 -; GFX9-NEXT: v_floor_f32_e32 v3, v3 -; GFX9-NEXT: s_mov_b32 s5, 0xcf800000 -; GFX9-NEXT: v_fma_f32 v4, v3, s5, |v2| -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX9-NEXT: v_trunc_f32_e32 v5, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e64 v0, |v5|, s4 -; GFX9-NEXT: v_floor_f32_e32 v0, v0 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2 -; GFX9-NEXT: v_fma_f32 v6, v0, s5, |v5| -; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v0 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v2, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5 -; GFX9-NEXT: v_mul_f32_e64 v5, |v1|, s4 -; GFX9-NEXT: v_floor_f32_e32 v5, v5 -; GFX9-NEXT: v_xor_b32_e32 v2, v7, v3 -; GFX9-NEXT: v_fma_f32 v7, v5, s5, |v1| -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: v_xor_b32_e32 v4, v8, v3 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v4, v7, v1 -; GFX9-NEXT: v_xor_b32_e32 v5, v5, v1 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_v3bf16_to_v3i64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: v_trunc_f32_e32 v2, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x2f800000 +; GFX900-NEXT: 
v_mul_f32_e64 v3, |v2|, s4 +; GFX900-NEXT: v_floor_f32_e32 v3, v3 +; GFX900-NEXT: s_mov_b32 s5, 0xcf800000 +; GFX900-NEXT: v_fma_f32 v4, v3, s5, |v2| +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX900-NEXT: v_trunc_f32_e32 v5, v0 +; GFX900-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX900-NEXT: v_mul_f32_e64 v0, |v5|, s4 +; GFX900-NEXT: v_floor_f32_e32 v0, v0 +; GFX900-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX900-NEXT: v_fma_f32 v6, v0, s5, |v5| +; GFX900-NEXT: v_xor_b32_e32 v4, v4, v2 +; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_xor_b32_e32 v3, v3, v2 +; GFX900-NEXT: v_cvt_u32_f32_e32 v8, v0 +; GFX900-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 +; GFX900-NEXT: v_trunc_f32_e32 v1, v1 +; GFX900-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v2, vcc +; GFX900-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GFX900-NEXT: v_mul_f32_e64 v5, |v1|, s4 +; GFX900-NEXT: v_floor_f32_e32 v5, v5 +; GFX900-NEXT: v_xor_b32_e32 v2, v7, v3 +; GFX900-NEXT: v_fma_f32 v7, v5, s5, |v1| +; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX900-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX900-NEXT: v_xor_b32_e32 v4, v8, v3 +; GFX900-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX900-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX900-NEXT: v_xor_b32_e32 v4, v7, v1 +; GFX900-NEXT: v_xor_b32_e32 v5, v5, v1 +; GFX900-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v1 +; GFX900-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fptosi_v3bf16_to_v3i64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX950-NEXT: v_trunc_f32_e32 v2, v2 +; GFX950-NEXT: s_mov_b32 s0, 0x2f800000 +; GFX950-NEXT: v_mul_f32_e64 v3, |v2|, s0 +; GFX950-NEXT: v_floor_f32_e32 v3, v3 +; GFX950-NEXT: s_mov_b32 s1, 0xcf800000 +; GFX950-NEXT: v_fma_f32 v4, v3, s1, |v2| +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX950-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX950-NEXT: v_trunc_f32_e32 v5, v0 +; GFX950-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX950-NEXT: v_mul_f32_e64 v0, |v5|, s0 +; GFX950-NEXT: v_floor_f32_e32 v0, v0 +; GFX950-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX950-NEXT: v_fma_f32 v6, v0, s1, |v5| +; GFX950-NEXT: v_xor_b32_e32 v4, v4, v2 +; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_xor_b32_e32 v3, v3, v2 +; GFX950-NEXT: v_cvt_u32_f32_e32 v8, v0 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 +; GFX950-NEXT: v_trunc_f32_e32 v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v2, vcc +; GFX950-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GFX950-NEXT: v_mul_f32_e64 v5, |v1|, s0 +; GFX950-NEXT: v_floor_f32_e32 v5, v5 +; GFX950-NEXT: v_xor_b32_e32 v2, v7, v3 +; GFX950-NEXT: v_fma_f32 v7, v5, s1, |v1| +; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX950-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX950-NEXT: v_xor_b32_e32 v4, v8, v3 +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX950-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX950-NEXT: v_xor_b32_e32 v4, v7, v1 +; GFX950-NEXT: v_xor_b32_e32 v5, v5, v1 +; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc +; GFX950-NEXT: v_mov_b32_e32 v1, v6 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v3bf16_to_v3i64: ; 
GFX10: ; %bb.0: @@ -32393,61 +34607,120 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) { ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_v4bf16_to_v4i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x2f800000 -; GFX9-NEXT: v_mul_f32_e64 v3, |v2|, s4 -; GFX9-NEXT: v_floor_f32_e32 v3, v3 -; GFX9-NEXT: s_mov_b32 s5, 0xcf800000 -; GFX9-NEXT: v_fma_f32 v4, v3, s5, |v2| -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX9-NEXT: v_trunc_f32_e32 v5, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e64 v0, |v5|, s4 -; GFX9-NEXT: v_floor_f32_e32 v0, v0 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2 -; GFX9-NEXT: v_fma_f32 v6, v0, s5, |v5| -; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v0 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v3, v2, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_xor_b32_e32 v2, v6, v3 -; GFX9-NEXT: v_mul_f32_e64 v6, |v5|, s4 -; GFX9-NEXT: v_floor_f32_e32 v6, v6 -; GFX9-NEXT: v_xor_b32_e32 v4, v7, v3 -; GFX9-NEXT: v_fma_f32 v7, v6, s5, |v5| -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v4, v7, v5 -; GFX9-NEXT: v_mul_f32_e64 v7, |v1|, s4 -; GFX9-NEXT: v_floor_f32_e32 v7, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX9-NEXT: v_fma_f32 v9, v7, s5, |v1| -; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX9-NEXT: v_xor_b32_e32 v6, v6, v5 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v5 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc -; GFX9-NEXT: v_xor_b32_e32 v6, v9, v1 -; GFX9-NEXT: v_xor_b32_e32 v7, v7, v1 -; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, v8 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_v4bf16_to_v4i64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: v_trunc_f32_e32 v2, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x2f800000 +; GFX900-NEXT: v_mul_f32_e64 v3, |v2|, s4 +; GFX900-NEXT: v_floor_f32_e32 v3, v3 +; GFX900-NEXT: s_mov_b32 s5, 0xcf800000 +; GFX900-NEXT: v_fma_f32 v4, v3, s5, |v2| +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX900-NEXT: v_trunc_f32_e32 v5, v0 +; GFX900-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX900-NEXT: v_mul_f32_e64 v0, |v5|, s4 +; GFX900-NEXT: v_floor_f32_e32 v0, v0 +; GFX900-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX900-NEXT: v_fma_f32 v6, v0, s5, |v5| +; GFX900-NEXT: v_xor_b32_e32 v4, v4, v2 +; GFX900-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX900-NEXT: v_xor_b32_e32 v3, v3, v2 +; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v0 +; GFX900-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 +; GFX900-NEXT: v_subb_co_u32_e32 v8, vcc, v3, v2, vcc +; GFX900-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; 
GFX900-NEXT: v_trunc_f32_e32 v5, v5 +; GFX900-NEXT: v_xor_b32_e32 v2, v6, v3 +; GFX900-NEXT: v_mul_f32_e64 v6, |v5|, s4 +; GFX900-NEXT: v_floor_f32_e32 v6, v6 +; GFX900-NEXT: v_xor_b32_e32 v4, v7, v3 +; GFX900-NEXT: v_fma_f32 v7, v6, s5, |v5| +; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX900-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX900-NEXT: v_trunc_f32_e32 v1, v1 +; GFX900-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX900-NEXT: v_xor_b32_e32 v4, v7, v5 +; GFX900-NEXT: v_mul_f32_e64 v7, |v1|, s4 +; GFX900-NEXT: v_floor_f32_e32 v7, v7 +; GFX900-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX900-NEXT: v_fma_f32 v9, v7, s5, |v1| +; GFX900-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX900-NEXT: v_xor_b32_e32 v6, v6, v5 +; GFX900-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v5 +; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX900-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc +; GFX900-NEXT: v_xor_b32_e32 v6, v9, v1 +; GFX900-NEXT: v_xor_b32_e32 v7, v7, v1 +; GFX900-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v1 +; GFX900-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX900-NEXT: v_mov_b32_e32 v1, v8 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fptosi_v4bf16_to_v4i64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX950-NEXT: v_trunc_f32_e32 v2, v2 +; GFX950-NEXT: s_mov_b32 s0, 0x2f800000 +; GFX950-NEXT: v_mul_f32_e64 v3, |v2|, s0 +; GFX950-NEXT: v_floor_f32_e32 v3, v3 +; GFX950-NEXT: s_mov_b32 s1, 0xcf800000 +; GFX950-NEXT: v_fma_f32 v4, v3, s1, |v2| +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX950-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX950-NEXT: v_trunc_f32_e32 v5, v0 +; GFX950-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX950-NEXT: v_mul_f32_e64 v0, |v5|, s0 +; GFX950-NEXT: v_floor_f32_e32 v0, v0 +; GFX950-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX950-NEXT: v_fma_f32 v6, v0, s1, |v5| +; GFX950-NEXT: v_xor_b32_e32 v4, v4, v2 +; GFX950-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX950-NEXT: v_xor_b32_e32 v3, v3, v2 +; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v0 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v8, vcc, v3, v2, vcc +; GFX950-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX950-NEXT: v_trunc_f32_e32 v5, v5 +; GFX950-NEXT: v_xor_b32_e32 v2, v6, v3 +; GFX950-NEXT: v_mul_f32_e64 v6, |v5|, s0 +; GFX950-NEXT: v_floor_f32_e32 v6, v6 +; GFX950-NEXT: v_xor_b32_e32 v4, v7, v3 +; GFX950-NEXT: v_fma_f32 v7, v6, s1, |v5| +; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX950-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX950-NEXT: v_trunc_f32_e32 v1, v1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX950-NEXT: v_xor_b32_e32 v4, v7, v5 +; GFX950-NEXT: v_mul_f32_e64 v7, |v1|, s0 +; GFX950-NEXT: v_floor_f32_e32 v7, v7 +; GFX950-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX950-NEXT: v_fma_f32 v9, v7, s1, |v1| +; GFX950-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX950-NEXT: v_xor_b32_e32 v6, v6, v5 +; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v5 +; GFX950-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc +; GFX950-NEXT: v_xor_b32_e32 v6, v9, v1 +; GFX950-NEXT: v_xor_b32_e32 v7, v7, v1 +; GFX950-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v1 
+; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX950-NEXT: v_mov_b32_e32 v1, v8 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v4bf16_to_v4i64: ; GFX10: ; %bb.0: @@ -32594,18 +34867,25 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_i16_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_i16_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_i16_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_i16_to_bf16: ; GFX10: ; %bb.0: @@ -32698,25 +34978,33 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v2i16_to_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v2i16_to_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 
0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v2i16_to_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v2i16_to_v2bf16: ; GFX10: ; %bb.0: @@ -32846,32 +35134,42 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v3i16_to_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v3i16_to_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v3i16_to_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v3i16_to_v3bf16: ; GFX10: ; %bb.0: @@ -33042,38 +35340,49 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v4i16_to_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; 
GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v4i16_to_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v4i16_to_v4bf16: ; GFX10: ; %bb.0: @@ -33219,18 +35528,25 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_i32_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_i32_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_i32_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_i32_to_bf16: ; GFX10: ; %bb.0: @@ -33315,25 +35631,33 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v2i32_to_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v2i32_to_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; 
GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v2i32_to_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v2i32_to_v2bf16: ; GFX10: ; %bb.0: @@ -33452,32 +35776,42 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v3i32_to_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v3i32_to_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v2, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v3i32_to_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX950-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v2, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v3i32_to_v3bf16: ; GFX10: ; %bb.0: @@ -33629,38 +35963,49 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v4i32_to_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v4i32_to_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX900-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v4, v4, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX900-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v4, v4, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v4i32_to_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX950-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v4i32_to_v4bf16: ; GFX10: ; %bb.0: @@ -33827,29 +36172,47 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX9-LABEL: v_sitofp_i64_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v2, v0, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2 -; GFX9-NEXT: v_ffbh_i32_e32 v3, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 32, v2 -; GFX9-NEXT: v_add_u32_e32 v3, -1, v3 -; GFX9-NEXT: v_min_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_i64_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_xor_b32_e32 v2, v0, v1 +; GFX900-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX900-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX900-NEXT: v_add_u32_e32 v2, 32, v2 +; GFX900-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX900-NEXT: v_min_u32_e32 v2, v3, v2 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v2 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_i64_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v2, v0, v1 +; GFX950-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX950-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX950-NEXT: v_add_u32_e32 v2, 32, v2 +; GFX950-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX950-NEXT: v_min_u32_e32 v2, v3, v2 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v2 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_i64_to_bf16: ; GFX10: ; %bb.0: @@ -34044,47 +36407,77 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v2i64_to_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v5, v0, v1 -; GFX9-NEXT: v_ffbh_i32_e32 v4, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX9-NEXT: v_add_u32_e32 v4, -1, v4 -; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 -; GFX9-NEXT: v_min_u32_e32 v4, v4, v5 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4 -; GFX9-NEXT: v_ldexp_f32 v4, v0, v1 -; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1 -; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 -; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4 -; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 -; GFX9-NEXT: v_min_u32_e32 v6, v0, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v2i64_to_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_xor_b32_e32 v5, v0, v1 +; GFX900-NEXT: v_ffbh_i32_e32 v4, v1 +; GFX900-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX900-NEXT: v_add_u32_e32 v4, -1, v4 +; GFX900-NEXT: v_add_u32_e32 v5, 32, v5 +; GFX900-NEXT: v_min_u32_e32 v4, v4, v5 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v4 +; GFX900-NEXT: v_ldexp_f32 v4, v0, v1 +; GFX900-NEXT: v_bfe_u32 v0, v4, 16, 1 +; GFX900-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX900-NEXT: v_add3_u32 v5, v0, v4, s4 +; GFX900-NEXT: v_ffbh_i32_e32 v0, v3 +; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX900-NEXT: v_add_u32_e32 v0, -1, v0 +; GFX900-NEXT: v_add_u32_e32 v1, 32, v1 +; GFX900-NEXT: v_min_u32_e32 v6, v0, v1 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX900-NEXT: v_sub_u32_e32 v2, 32, v6 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v2i64_to_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v5, v2, v3 +; GFX950-NEXT: v_ffbh_i32_e32 v4, v3 +; GFX950-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX950-NEXT: v_add_u32_e32 v4, -1, v4 +; GFX950-NEXT: v_add_u32_e32 v5, 32, v5 +; GFX950-NEXT: v_min_u32_e32 v4, v4, v5 +; GFX950-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX950-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX950-NEXT: v_xor_b32_e32 v5, v0, v1 +; GFX950-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX950-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX950-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX950-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX950-NEXT: v_add_u32_e32 v5, 32, v5 +; GFX950-NEXT: v_min_u32_e32 v3, v3, v5 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: 
v_sub_u32_e32 v1, 32, v4 +; GFX950-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v2i64_to_v2bf16: ; GFX10: ; %bb.0: @@ -34386,65 +36779,109 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v3i64_to_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v7, v4, v5 -; GFX9-NEXT: v_ffbh_i32_e32 v6, v5 -; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7 -; GFX9-NEXT: v_add_u32_e32 v6, -1, v6 -; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 -; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GFX9-NEXT: v_xor_b32_e32 v7, v0, v1 -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 -; GFX9-NEXT: v_ffbh_i32_e32 v6, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7 -; GFX9-NEXT: v_add_u32_e32 v6, -1, v6 -; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 -; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc -; GFX9-NEXT: v_ldexp_f32 v5, v0, v1 -; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1 -; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 -; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4 -; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 -; GFX9-NEXT: v_min_u32_e32 v7, v0, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v3i64_to_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_xor_b32_e32 v7, v4, v5 +; GFX900-NEXT: v_ffbh_i32_e32 v6, v5 +; GFX900-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX900-NEXT: v_add_u32_e32 v6, -1, v6 +; GFX900-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX900-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX900-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GFX900-NEXT: v_xor_b32_e32 v7, v0, v1 +; GFX900-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX900-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX900-NEXT: v_sub_u32_e32 v5, 32, v6 +; GFX900-NEXT: v_ffbh_i32_e32 v6, v1 +; GFX900-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX900-NEXT: v_add_u32_e32 v6, -1, v6 +; GFX900-NEXT: v_add_u32_e32 v7, 32, 
v7 +; GFX900-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] +; GFX900-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; GFX900-NEXT: v_ldexp_f32 v5, v0, v1 +; GFX900-NEXT: v_bfe_u32 v0, v5, 16, 1 +; GFX900-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX900-NEXT: v_add3_u32 v6, v0, v5, s4 +; GFX900-NEXT: v_ffbh_i32_e32 v0, v3 +; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX900-NEXT: v_add_u32_e32 v0, -1, v0 +; GFX900-NEXT: v_add_u32_e32 v1, 32, v1 +; GFX900-NEXT: v_min_u32_e32 v7, v0, v1 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX900-NEXT: v_sub_u32_e32 v2, 32, v7 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v4, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v3i64_to_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v7, v4, v5 +; GFX950-NEXT: v_ffbh_i32_e32 v6, v5 +; GFX950-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX950-NEXT: v_add_u32_e32 v6, -1, v6 +; GFX950-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX950-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX950-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GFX950-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX950-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX950-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX950-NEXT: v_sub_u32_e32 v5, 32, v6 +; GFX950-NEXT: v_xor_b32_e32 v6, v2, v3 +; GFX950-NEXT: v_ashrrev_i32_e32 v6, 31, v6 +; GFX950-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX950-NEXT: v_ffbh_i32_e32 v5, v3 +; GFX950-NEXT: v_add_u32_e32 v5, -1, v5 +; GFX950-NEXT: v_add_u32_e32 v6, 32, v6 +; GFX950-NEXT: v_min_u32_e32 v5, v5, v6 +; GFX950-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] +; GFX950-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX950-NEXT: v_xor_b32_e32 v6, v0, v1 +; GFX950-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX950-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX950-NEXT: v_ashrrev_i32_e32 v6, 31, v6 +; GFX950-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX950-NEXT: v_add_u32_e32 v6, 32, v6 +; GFX950-NEXT: v_min_u32_e32 v3, v3, v6 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v5 +; GFX950-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
v_sitofp_v3i64_to_v3bf16: ; GFX10: ; %bb.0: @@ -34842,82 +37279,137 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v4i64_to_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v9, v4, v5 -; GFX9-NEXT: v_ffbh_i32_e32 v8, v5 -; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v9 -; GFX9-NEXT: v_add_u32_e32 v8, -1, v8 -; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 -; GFX9-NEXT: v_min_u32_e32 v8, v8, v9 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8 -; GFX9-NEXT: v_ldexp_f32 v8, v4, v5 -; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1 -; GFX9-NEXT: v_xor_b32_e32 v5, v6, v7 -; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4 -; GFX9-NEXT: v_ffbh_i32_e32 v4, v7 -; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX9-NEXT: v_add_u32_e32 v4, -1, v4 -; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 -; GFX9-NEXT: v_min_u32_e32 v10, v4, v5 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_xor_b32_e32 v8, v0, v1 -; GFX9-NEXT: v_ffbh_i32_e32 v7, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GFX9-NEXT: v_add_u32_e32 v7, -1, v7 -; GFX9-NEXT: v_add_u32_e32 v8, 32, v8 -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_min_u32_e32 v7, v7, v8 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_ldexp_f32 v4, v4, v6 -; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc -; GFX9-NEXT: v_ldexp_f32 v6, v0, v1 -; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1 -; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 -; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4 -; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 -; GFX9-NEXT: v_min_u32_e32 v8, v0, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v4i64_to_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_xor_b32_e32 v9, v4, v5 +; GFX900-NEXT: v_ffbh_i32_e32 v8, v5 +; GFX900-NEXT: v_ashrrev_i32_e32 v9, 31, v9 +; GFX900-NEXT: v_add_u32_e32 v8, -1, v8 +; 
GFX900-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX900-NEXT: v_min_u32_e32 v8, v8, v9 +; GFX900-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX900-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX900-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX900-NEXT: v_sub_u32_e32 v5, 32, v8 +; GFX900-NEXT: v_ldexp_f32 v8, v4, v5 +; GFX900-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX900-NEXT: v_xor_b32_e32 v5, v6, v7 +; GFX900-NEXT: v_add3_u32 v9, v4, v8, s4 +; GFX900-NEXT: v_ffbh_i32_e32 v4, v7 +; GFX900-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX900-NEXT: v_add_u32_e32 v4, -1, v4 +; GFX900-NEXT: v_add_u32_e32 v5, 32, v5 +; GFX900-NEXT: v_min_u32_e32 v10, v4, v5 +; GFX900-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_xor_b32_e32 v8, v0, v1 +; GFX900-NEXT: v_ffbh_i32_e32 v7, v1 +; GFX900-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; GFX900-NEXT: v_add_u32_e32 v7, -1, v7 +; GFX900-NEXT: v_add_u32_e32 v8, 32, v8 +; GFX900-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX900-NEXT: v_min_u32_e32 v7, v7, v8 +; GFX900-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX900-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc +; GFX900-NEXT: v_sub_u32_e32 v6, 32, v10 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX900-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc +; GFX900-NEXT: v_ldexp_f32 v6, v0, v1 +; GFX900-NEXT: v_bfe_u32 v0, v6, 16, 1 +; GFX900-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX900-NEXT: v_add3_u32 v7, v0, v6, s4 +; GFX900-NEXT: v_ffbh_i32_e32 v0, v3 +; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX900-NEXT: v_add_u32_e32 v0, -1, v0 +; GFX900-NEXT: v_add_u32_e32 v1, 32, v1 +; GFX900-NEXT: v_min_u32_e32 v8, v0, v1 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX900-NEXT: v_sub_u32_e32 v2, 32, v8 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v4, v5, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v4i64_to_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v9, v6, v7 +; GFX950-NEXT: v_ffbh_i32_e32 v8, v7 +; GFX950-NEXT: v_ashrrev_i32_e32 v9, 31, v9 +; GFX950-NEXT: v_add_u32_e32 v8, -1, v8 +; GFX950-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX950-NEXT: v_min_u32_e32 v8, v8, v9 +; GFX950-NEXT: v_lshlrev_b64 v[6:7], v8, v[6:7] +; GFX950-NEXT: v_min_u32_e32 v6, 1, v6 +; GFX950-NEXT: v_xor_b32_e32 v9, v4, v5 +; GFX950-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX950-NEXT: v_ffbh_i32_e32 v7, v5 +; GFX950-NEXT: v_ashrrev_i32_e32 v9, 31, v9 +; GFX950-NEXT: 
v_add_u32_e32 v7, -1, v7 +; GFX950-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX950-NEXT: v_min_u32_e32 v7, v7, v9 +; GFX950-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] +; GFX950-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX950-NEXT: v_cvt_f32_i32_e32 v6, v6 +; GFX950-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX950-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX950-NEXT: v_sub_u32_e32 v5, 32, v8 +; GFX950-NEXT: v_ldexp_f32 v5, v6, v5 +; GFX950-NEXT: v_sub_u32_e32 v6, 32, v7 +; GFX950-NEXT: v_xor_b32_e32 v7, v2, v3 +; GFX950-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX950-NEXT: v_ffbh_i32_e32 v6, v3 +; GFX950-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX950-NEXT: v_add_u32_e32 v6, -1, v6 +; GFX950-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX950-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX950-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] +; GFX950-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX950-NEXT: v_xor_b32_e32 v7, v0, v1 +; GFX950-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX950-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX950-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX950-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX950-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX950-NEXT: v_min_u32_e32 v3, v3, v7 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v6 +; GFX950-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v4, v5 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16: ; GFX10: ; %bb.0: @@ -35202,18 +37694,25 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_i16_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_i16_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_i16_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_i16_to_bf16: ; GFX10: ; %bb.0: @@ -35306,25 +37805,33 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v2i16_to_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_sdwa 
v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v2i16_to_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v2i16_to_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v2i16_to_v2bf16: ; GFX10: ; %bb.0: @@ -35457,32 +37964,42 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v3i16_to_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v3i16_to_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v3i16_to_v3bf16: ; GFX10: ; %bb.0: @@ -35656,38 +38173,49 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v4i16_to_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v4i16_to_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v4i16_to_v4bf16: ; GFX10: ; %bb.0: @@ -35838,18 +38366,25 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_i32_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_i32_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_i32_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; 
; GFX10-LABEL: v_uitofp_i32_to_bf16: ; GFX10: ; %bb.0: @@ -35934,25 +38469,33 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v2i32_to_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v2i32_to_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v2i32_to_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v2i32_to_v2bf16: ; GFX10: ; %bb.0: @@ -36071,32 +38614,42 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v3i32_to_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v3i32_to_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v2, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v3i32_to_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX950-NEXT: v_cvt_f32_u32_e32 v3, v1 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v2, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v3i32_to_v3bf16: ; GFX10: ; %bb.0: @@ -36248,38 +38801,49 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v4i32_to_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v4i32_to_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX900-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v4, v4, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: 
v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX900-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v4, v4, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v4i32_to_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX950-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v4i32_to_v4bf16: ; GFX10: ; %bb.0: @@ -36434,25 +38998,39 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_i64_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ffbh_u32_e32 v2, v1 -; GFX9-NEXT: v_min_u32_e32 v2, 32, v2 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_i64_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_ffbh_u32_e32 v2, v1 +; GFX900-NEXT: v_min_u32_e32 v2, 32, v2 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v2 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_i64_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_ffbh_u32_e32 v2, v1 +; GFX950-NEXT: v_min_u32_e32 v2, 32, v2 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v2 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_i64_to_bf16: ; GFX10: ; %bb.0: @@ -36606,39 +39184,61 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX9-LABEL: v_uitofp_v2i64_to_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ffbh_u32_e32 v4, v1 -; GFX9-NEXT: v_min_u32_e32 v4, 32, v4 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4 -; GFX9-NEXT: v_ldexp_f32 v4, v0, v1 -; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4 -; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX9-NEXT: v_min_u32_e32 v6, 32, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v2i64_to_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_ffbh_u32_e32 v4, v1 +; GFX900-NEXT: v_min_u32_e32 v4, 32, v4 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v4 +; GFX900-NEXT: v_ldexp_f32 v4, v0, v1 +; GFX900-NEXT: v_bfe_u32 v0, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v5, v0, v4, s4 +; GFX900-NEXT: v_ffbh_u32_e32 v0, v3 +; GFX900-NEXT: v_min_u32_e32 v6, 32, v0 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX900-NEXT: v_sub_u32_e32 v2, 32, v6 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v2i64_to_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_ffbh_u32_e32 v4, v3 +; GFX950-NEXT: v_min_u32_e32 v4, 32, v4 +; GFX950-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX950-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX950-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX950-NEXT: v_ffbh_u32_e32 v3, v1 +; GFX950-NEXT: v_min_u32_e32 v3, 32, v3 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v4 +; GFX950-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: v_uitofp_v2i64_to_v2bf16: ; GFX10: ; %bb.0: @@ -36874,53 +39474,85 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v3i64_to_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ffbh_u32_e32 v6, v5 -; GFX9-NEXT: v_min_u32_e32 v6, 32, v6 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 -; GFX9-NEXT: v_ffbh_u32_e32 v6, v1 -; GFX9-NEXT: v_min_u32_e32 v6, 32, v6 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc -; GFX9-NEXT: v_ldexp_f32 v5, v0, v1 -; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1 -; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4 -; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX9-NEXT: v_min_u32_e32 v7, 32, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v3i64_to_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_ffbh_u32_e32 v6, v5 +; GFX900-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX900-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX900-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX900-NEXT: v_sub_u32_e32 v5, 32, v6 +; GFX900-NEXT: v_ffbh_u32_e32 v6, v1 +; GFX900-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] +; GFX900-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; GFX900-NEXT: v_ldexp_f32 v5, v0, v1 +; GFX900-NEXT: v_bfe_u32 v0, v5, 16, 1 +; GFX900-NEXT: v_add3_u32 v6, v0, v5, s4 +; GFX900-NEXT: v_ffbh_u32_e32 v0, v3 +; GFX900-NEXT: v_min_u32_e32 v7, 32, v0 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 
vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX900-NEXT: v_sub_u32_e32 v2, 32, v7 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v4, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v3i64_to_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_ffbh_u32_e32 v6, v5 +; GFX950-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX950-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GFX950-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX950-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX950-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX950-NEXT: v_sub_u32_e32 v5, 32, v6 +; GFX950-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX950-NEXT: v_ffbh_u32_e32 v5, v3 +; GFX950-NEXT: v_min_u32_e32 v5, 32, v5 +; GFX950-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] +; GFX950-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX950-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX950-NEXT: v_ffbh_u32_e32 v3, v1 +; GFX950-NEXT: v_min_u32_e32 v3, 32, v3 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v5 +; GFX950-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v3i64_to_v3bf16: ; GFX10: ; %bb.0: @@ -37236,66 +39868,105 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v4i64_to_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ffbh_u32_e32 v8, v5 -; GFX9-NEXT: v_min_u32_e32 v8, 32, v8 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8 -; GFX9-NEXT: v_ldexp_f32 v8, v4, v5 -; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1 -; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4 -; GFX9-NEXT: v_ffbh_u32_e32 v4, v7 -; GFX9-NEXT: v_min_u32_e32 v10, 32, v4 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] -; GFX9-NEXT: v_ffbh_u32_e32 v7, v1 -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_min_u32_e32 v7, 32, v7 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_ldexp_f32 v4, v4, v6 -; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc -; GFX9-NEXT: v_ldexp_f32 v6, v0, v1 -; 
GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1 -; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4 -; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX9-NEXT: v_min_u32_e32 v8, 32, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v4i64_to_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_ffbh_u32_e32 v8, v5 +; GFX900-NEXT: v_min_u32_e32 v8, 32, v8 +; GFX900-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX900-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX900-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX900-NEXT: v_sub_u32_e32 v5, 32, v8 +; GFX900-NEXT: v_ldexp_f32 v8, v4, v5 +; GFX900-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX900-NEXT: v_add3_u32 v9, v4, v8, s4 +; GFX900-NEXT: v_ffbh_u32_e32 v4, v7 +; GFX900-NEXT: v_min_u32_e32 v10, 32, v4 +; GFX900-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] +; GFX900-NEXT: v_ffbh_u32_e32 v7, v1 +; GFX900-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX900-NEXT: v_min_u32_e32 v7, 32, v7 +; GFX900-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX900-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc +; GFX900-NEXT: v_sub_u32_e32 v6, 32, v10 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX900-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc +; GFX900-NEXT: v_ldexp_f32 v6, v0, v1 +; GFX900-NEXT: v_bfe_u32 v0, v6, 16, 1 +; GFX900-NEXT: v_add3_u32 v7, v0, v6, s4 +; GFX900-NEXT: v_ffbh_u32_e32 v0, v3 +; GFX900-NEXT: v_min_u32_e32 v8, 32, v0 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX900-NEXT: v_sub_u32_e32 v2, 32, v8 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v4, v5, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v4i64_to_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: 
v_ffbh_u32_e32 v8, v7 +; GFX950-NEXT: v_min_u32_e32 v8, 32, v8 +; GFX950-NEXT: v_lshlrev_b64 v[6:7], v8, v[6:7] +; GFX950-NEXT: v_min_u32_e32 v6, 1, v6 +; GFX950-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX950-NEXT: v_ffbh_u32_e32 v7, v5 +; GFX950-NEXT: v_min_u32_e32 v7, 32, v7 +; GFX950-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] +; GFX950-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX950-NEXT: v_cvt_f32_u32_e32 v6, v6 +; GFX950-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX950-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX950-NEXT: v_sub_u32_e32 v5, 32, v8 +; GFX950-NEXT: v_ldexp_f32 v5, v6, v5 +; GFX950-NEXT: v_sub_u32_e32 v6, 32, v7 +; GFX950-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX950-NEXT: v_ffbh_u32_e32 v6, v3 +; GFX950-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX950-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] +; GFX950-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX950-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX950-NEXT: v_ffbh_u32_e32 v3, v1 +; GFX950-NEXT: v_min_u32_e32 v3, 32, v3 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v6 +; GFX950-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v4, v5 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v4i64_to_v4bf16: ; GFX10: ; %bb.0: @@ -37531,13 +40202,22 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_bf16: ; GFX10: ; %bb.0: @@ -37600,14 +40280,24 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_fneg_lhs_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_fneg_lhs_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_fneg_lhs_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; 
GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_fneg_lhs_bf16: ; GFX10: ; %bb.0: @@ -37674,14 +40364,24 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_fneg_rhs_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_xor_b32_e32 v2, 0x8000, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_fneg_rhs_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_fneg_rhs_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_fneg_rhs_bf16: ; GFX10: ; %bb.0: @@ -37765,16 +40465,28 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v2bf16: ; GFX10: ; %bb.0: @@ -37859,18 +40571,32 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_vselect_v2bf16: -; GFX9: ; 
%bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_vselect_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_vselect_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[0:1] +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v2bf16: ; GFX10: ; %bb.0: @@ -37946,15 +40672,27 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_select_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX900-LABEL: s_select_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: v_mov_b32_e32 v1, s1 +; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX900-NEXT: v_readfirstlane_b32 s0, v0 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_select_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_select_bf16: ; GFX10: ; %bb.0: @@ -38046,21 +40784,39 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_select_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; 
GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX900-LABEL: s_select_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_lshr_b32 s2, s0, 16 +; GFX900-NEXT: s_lshr_b32 s3, s1, 16 +; GFX900-NEXT: v_mov_b32_e32 v1, s3 +; GFX900-NEXT: v_mov_b32_e32 v2, s2 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_mov_b32_e32 v1, s1 +; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX900-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s0 +; GFX900-NEXT: v_readfirstlane_b32 s0, v0 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_select_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_lshr_b32 s2, s0, 16 +; GFX950-NEXT: s_lshr_b32 s3, s1, 16 +; GFX950-NEXT: v_mov_b32_e32 v1, s3 +; GFX950-NEXT: v_mov_b32_e32 v2, s2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_perm_b32 v0, v0, v1, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_select_v2bf16: ; GFX10: ; %bb.0: @@ -38159,22 +40915,42 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_vselect_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX900-LABEL: s_vselect_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_lshr_b32 s2, s0, 16 +; GFX900-NEXT: s_lshr_b32 s3, s1, 16 +; GFX900-NEXT: v_mov_b32_e32 v2, s3 +; GFX900-NEXT: v_mov_b32_e32 v3, s2 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX900-NEXT: v_mov_b32_e32 v2, s1 +; GFX900-NEXT: v_mov_b32_e32 v3, s0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-NEXT: v_readfirstlane_b32 s0, v0 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_vselect_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_lshr_b32 s2, s0, 16 +; GFX950-NEXT: s_lshr_b32 s3, s1, 16 +; GFX950-NEXT: v_mov_b32_e32 v2, s3 +; GFX950-NEXT: v_mov_b32_e32 v3, s2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX950-NEXT: v_mov_b32_e32 v2, s1 +; GFX950-NEXT: v_mov_b32_e32 v3, s0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, 
v2, v3, vcc +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_vselect_v2bf16: ; GFX10: ; %bb.0: @@ -38285,14 +41061,24 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v3bf16: ; GFX10: ; %bb.0: @@ -38383,14 +41169,24 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v4bf16: ; GFX10: ; %bb.0: @@ -38504,15 +41300,26 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v6bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v6bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, 
v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_v6bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v6bf16: ; GFX10: ; %bb.0: @@ -38651,16 +41458,28 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) ; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v8bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v8bf16: ; GFX10: ; %bb.0: @@ -38900,20 +41719,36 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v16bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v16bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX950-LABEL: v_select_v16bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v16bf16: ; GFX10: ; %bb.0: @@ -39469,32 +42304,60 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v32bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v32bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc +; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX900-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc +; GFX900-NEXT: s_waitcnt vmcnt(1) +; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_v32bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: 
scratch_load_dword v32, off, s32 offset:4 +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: v_cndmask_b32_e32 v14, v31, v15, vcc +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v32bf16: ; GFX10: ; %bb.0: @@ -39604,19 +42467,34 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_select_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: ; return to shader part epilog +; GFX900-LABEL: s_select_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: v_mov_b32_e32 v1, s2 +; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_mov_b32_e32 v1, s3 +; GFX900-NEXT: v_mov_b32_e32 v2, s1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX900-NEXT: v_readfirstlane_b32 s0, v0 +; GFX900-NEXT: v_readfirstlane_b32 s1, v1 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_select_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v1, s2 +; GFX950-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX950-NEXT: v_mov_b32_e32 v1, s3 +; GFX950-NEXT: v_mov_b32_e32 v2, s1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: v_readfirstlane_b32 s1, v1 +; GFX950-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_select_v3bf16: ; GFX10: ; %bb.0: @@ -39720,18 +42598,32 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_select_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s1, 
v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX900-LABEL: s_select_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: v_mov_b32_e32 v1, s3 +; GFX900-NEXT: v_mov_b32_e32 v2, s1 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_mov_b32_e32 v1, s2 +; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX900-NEXT: v_readfirstlane_b32 s0, v1 +; GFX900-NEXT: v_readfirstlane_b32 s1, v0 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_select_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v1, s3 +; GFX950-NEXT: v_mov_b32_e32 v2, s1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX950-NEXT: v_mov_b32_e32 v1, s2 +; GFX950-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX950-NEXT: v_readfirstlane_b32 s1, v0 +; GFX950-NEXT: v_readfirstlane_b32 s0, v1 +; GFX950-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_select_v4bf16: ; GFX10: ; %bb.0: @@ -39854,34 +42746,66 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX8-NEXT: v_readfirstlane_b32 s1, v2 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_vselect_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX9-NEXT: s_mov_b32 s1, 0x5040100 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s1 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v2 -; GFX9-NEXT: ; return to shader part epilog +; GFX900-LABEL: s_vselect_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_lshr_b32 s4, s1, 16 +; GFX900-NEXT: s_lshr_b32 s5, s3, 16 +; GFX900-NEXT: v_mov_b32_e32 v4, s5 +; GFX900-NEXT: v_mov_b32_e32 v5, s4 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_mov_b32_e32 v4, s3 +; GFX900-NEXT: v_mov_b32_e32 v5, s1 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX900-NEXT: s_mov_b32 s1, 0x5040100 +; GFX900-NEXT: s_lshr_b32 s3, s0, 16 +; GFX900-NEXT: s_lshr_b32 s4, s2, 16 +; GFX900-NEXT: v_perm_b32 v2, v3, v2, s1 +; GFX900-NEXT: v_mov_b32_e32 v3, s4 +; GFX900-NEXT: v_mov_b32_e32 v4, s3 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_mov_b32_e32 v3, s2 +; GFX900-NEXT: v_mov_b32_e32 v4, s0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s1 +; GFX900-NEXT: v_readfirstlane_b32 s0, v0 +; GFX900-NEXT: v_readfirstlane_b32 s1, v2 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_vselect_v4bf16: +; GFX950: ; %bb.0: +; 
GFX950-NEXT: s_lshr_b32 s4, s1, 16 +; GFX950-NEXT: s_lshr_b32 s5, s3, 16 +; GFX950-NEXT: v_mov_b32_e32 v4, s5 +; GFX950-NEXT: v_mov_b32_e32 v5, s4 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX950-NEXT: s_lshr_b32 s4, s2, 16 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX950-NEXT: v_mov_b32_e32 v4, s3 +; GFX950-NEXT: v_mov_b32_e32 v5, s1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX950-NEXT: s_mov_b32 s1, 0x5040100 +; GFX950-NEXT: s_lshr_b32 s3, s0, 16 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX950-NEXT: v_perm_b32 v2, v3, v2, s1 +; GFX950-NEXT: v_mov_b32_e32 v3, s4 +; GFX950-NEXT: v_mov_b32_e32 v4, s3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX950-NEXT: v_mov_b32_e32 v3, s2 +; GFX950-NEXT: v_mov_b32_e32 v4, s0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s1 +; GFX950-NEXT: v_readfirstlane_b32 s1, v2 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_vselect_v4bf16: ; GFX10: ; %bb.0: @@ -40053,26 +42977,48 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_vselect_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX9-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[6:7] -; GFX9-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_vselect_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 +; GFX900-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX900-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX900-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX900-NEXT: s_mov_b64 vcc, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[6:7] +; GFX900-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_vselect_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v1, 1, v1 +; 
GFX950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 +; GFX950-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX950-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX950-NEXT: v_cmp_eq_u32_e64 s[2:3], 1, v1 +; GFX950-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[2:3] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX950-NEXT: s_mov_b64 vcc, s[0:1] +; GFX950-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX950-NEXT: v_perm_b32 v1, v2, v1, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v4bf16: ; GFX10: ; %bb.0: @@ -40294,47 +43240,93 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_vselect_v8bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v14 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v13 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v12 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 -; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4 -; GFX9-NEXT: v_perm_b32 v3, v7, v6, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_vselect_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX900-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX900-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX900-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX900-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX900-NEXT: 
v_cndmask_b32_e32 v4, v14, v10, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v14 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX900-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v5, v4, s4 +; GFX900-NEXT: v_perm_b32 v3, v7, v6, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_vselect_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX950-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX950-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX950-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX950-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX950-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX950-NEXT: v_lshrrev_b32_e32 v11, 16, v14 +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX950-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX950-NEXT: v_perm_b32 v1, v3, v2, s0 +; GFX950-NEXT: v_perm_b32 v2, v5, v4, s0 +; GFX950-NEXT: v_perm_b32 v3, v7, v6, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v8bf16: ; GFX10: ; %bb.0: @@ -40803,85 +43795,171 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_vselect_v16bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; 
GFX9-NEXT: v_and_b32_e32 v6, 1, v8
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6
-; GFX9-NEXT: v_and_b32_e32 v6, 1, v10
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v6
-; GFX9-NEXT: v_and_b32_e32 v6, 1, v12
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v6
-; GFX9-NEXT: v_and_b32_e32 v8, 1, v13
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v30, v22, s[8:9]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v8
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v22
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v30
-; GFX9-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX9-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX9-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX9-NEXT: v_and_b32_e32 v13, 1, v14
-; GFX9-NEXT: v_cndmask_b32_e64 v10, v12, v10, s[8:9]
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v21
-; GFX9-NEXT: v_cndmask_b32_e64 v14, v29, v21, s[6:7]
-; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v29
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v11
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v20
-; GFX9-NEXT: v_cndmask_b32_e64 v20, v28, v20, s[4:5]
-; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v19
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v27
-; GFX9-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v12, v21, v12, s[6:7]
-; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v28
-; GFX9-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v22, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5]
-; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX9-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v27, v21, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v23
-; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v6, v10, v6, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v8, v23, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v15, v8, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v15, v8, vcc
-; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
-; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4
-; GFX9-NEXT: v_perm_b32 v3, v9, v19, s4
-; GFX9-NEXT: v_perm_b32 v4, v11, v20, s4
-; GFX9-NEXT: v_perm_b32 v5, v12, v14, s4
-; GFX9-NEXT: v_perm_b32 v7, v7, v13, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_vselect_v16bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX900-NEXT: v_and_b32_e32 v6, 1, v8
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6
+; GFX900-NEXT: v_and_b32_e32 v6, 1, v10
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v6
+; GFX900-NEXT: v_and_b32_e32 v6, 1, v12
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v6
+; GFX900-NEXT: v_and_b32_e32 v8, 1, v13
+; GFX900-NEXT: v_cndmask_b32_e64 v6, v30, v22, s[8:9]
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v8
+; GFX900-NEXT: buffer_load_dword v8, off, s[0:3], s32
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v22
+; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v30
+; GFX900-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX900-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX900-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX900-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX900-NEXT: v_and_b32_e32 v13, 1, v14
+; GFX900-NEXT: v_cndmask_b32_e64 v10, v12, v10, s[8:9]
+; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v21
+; GFX900-NEXT: v_cndmask_b32_e64 v14, v29, v21, s[6:7]
+; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v29
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v11
+; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v20
+; GFX900-NEXT: v_cndmask_b32_e64 v20, v28, v20, s[4:5]
+; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v19
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v9
+; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v27
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX900-NEXT: v_cndmask_b32_e64 v12, v21, v12, s[6:7]
+; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v28
+; GFX900-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v9, v22, vcc
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX900-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5]
+; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX900-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX900-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v27, v21, vcc
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v23
+; GFX900-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX900-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v6, v10, v6, s4
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v8, v23, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v17
+; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v25
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v15, v8, vcc
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v16
+; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v24
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v15, v8, vcc
+; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX900-NEXT: v_perm_b32 v1, v3, v2, s4
+; GFX900-NEXT: v_perm_b32 v2, v5, v4, s4
+; GFX900-NEXT: v_perm_b32 v3, v9, v19, s4
+; GFX900-NEXT: v_perm_b32 v4, v11, v20, s4
+; GFX900-NEXT: v_perm_b32 v5, v12, v14, s4
+; GFX900-NEXT: v_perm_b32 v7, v7, v13, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_vselect_v16bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_load_dword v31, off, s32
+; GFX950-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX950-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; GFX950-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX950-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX950-NEXT: v_lshrrev_b32_e32 v34, 16, v30
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
+; GFX950-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX950-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; GFX950-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v29
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
+; GFX950-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX950-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v20
+; GFX950-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
+; GFX950-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX950-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX950-NEXT: v_lshrrev_b32_e32 v39, 16, v19
+; GFX950-NEXT: v_lshrrev_b32_e32 v48, 16, v27
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX950-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX950-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX950-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX950-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
+; GFX950-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX950-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX950-NEXT: v_lshrrev_b32_e32 v50, 16, v26
+; GFX950-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX950-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX950-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v18, 16, v31
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v18, v32, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX950-NEXT: v_lshrrev_b32_e32 v18, 16, v25
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v18, v17, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX950-NEXT: v_lshrrev_b32_e32 v17, 16, v24
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc
+; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0
+; GFX950-NEXT: v_perm_b32 v1, v3, v2, s0
+; GFX950-NEXT: v_perm_b32 v2, v5, v4, s0
+; GFX950-NEXT: v_perm_b32 v3, v7, v6, s0
+; GFX950-NEXT: v_perm_b32 v4, v9, v8, s0
+; GFX950-NEXT: v_perm_b32 v5, v11, v10, s0
+; GFX950-NEXT: v_perm_b32 v6, v13, v12, s0
+; GFX950-NEXT: v_perm_b32 v7, v15, v14, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_vselect_v16bf16:
 ; GFX10: ; %bb.0:
@@ -41981,205 +45059,438 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_vselect_v32bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v3
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v5
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v4
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v7
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v6
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v9
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v8
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v11
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v10
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v13
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v12
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v15
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v14
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v17
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v16
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v19
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v18
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v21
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v20
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v23
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v22
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v25
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v24
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v27
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v26
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v29
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[92:93], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v28
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32
-; GFX9-NEXT: v_writelane_b32 v33, s30, 0
-; GFX9-NEXT: v_writelane_b32 v33, s31, 1
-; GFX9-NEXT: v_writelane_b32 v33, s34, 2
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT: v_writelane_b32 v33, s35, 3
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v30
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
-; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80
-; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88
-; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24
-; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104
-; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
-; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
-; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120
-; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56
-; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124
-; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v30, v31, v32, s[34:35]
-; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[30:31]
-; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v29, s[94:95]
-; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v29, s[92:93]
-; GFX9-NEXT: v_cndmask_b32_e64 v29, v26, v27, s[90:91]
-; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89]
-; GFX9-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[78:79]
-; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77]
-; GFX9-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[74:75]
-; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73]
-; GFX9-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[62:63]
-; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61]
-; GFX9-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[58:59]
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57]
-; GFX9-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[46:47]
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45]
-; GFX9-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[42:43]
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41]
-; GFX9-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29]
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27]
-; GFX9-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[24:25]
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23]
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v8, v9, s[20:21]
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19]
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[16:17]
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15]
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[12:13]
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[8:9]
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5]
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_perm_b32 v1, v2, v5, s4
-; GFX9-NEXT: v_perm_b32 v2, v4, v7, s4
-; GFX9-NEXT: v_perm_b32 v3, v6, v9, s4
-; GFX9-NEXT: v_perm_b32 v4, v8, v11, s4
-; GFX9-NEXT: v_perm_b32 v5, v10, v13, s4
-; GFX9-NEXT: v_perm_b32 v6, v12, v15, s4
-; GFX9-NEXT: v_perm_b32 v7, v14, v17, s4
-; GFX9-NEXT: v_perm_b32 v8, v16, v19, s4
-; GFX9-NEXT: v_perm_b32 v9, v18, v21, s4
-; GFX9-NEXT: v_perm_b32 v10, v20, v23, s4
-; GFX9-NEXT: v_perm_b32 v11, v22, v25, s4
-; GFX9-NEXT: v_perm_b32 v12, v24, v27, s4
-; GFX9-NEXT: v_perm_b32 v13, v26, v29, s4
-; GFX9-NEXT: v_perm_b32 v14, v28, v32, s4
-; GFX9-NEXT: v_perm_b32 v15, v31, v30, s4
-; GFX9-NEXT: v_readlane_b32 s35, v33, 3
-; GFX9-NEXT: v_readlane_b32 s34, v33, 2
-; GFX9-NEXT: v_readlane_b32 s31, v33, 1
-; GFX9-NEXT: v_readlane_b32 s30, v33, 0
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_vselect_v32bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v3
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v2
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v5
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v4
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v7
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v6
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v9
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v8
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v11
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v10
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v13
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v12
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v15
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v14
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v17
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v16
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v19
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v18
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v21
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v20
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v23
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v22
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v25
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v24
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v27
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v26
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v29
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[92:93], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v28
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0
+; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], s32
+; GFX900-NEXT: v_writelane_b32 v33, s30, 0
+; GFX900-NEXT: v_writelane_b32 v33, s31, 1
+; GFX900-NEXT: v_writelane_b32 v33, s34, 2
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX900-NEXT: v_writelane_b32 v33, s35, 3
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v30
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
+; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68
+; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
+; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72
+; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
+; GFX900-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76
+; GFX900-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
+; GFX900-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80
+; GFX900-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16
+; GFX900-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84
+; GFX900-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20
+; GFX900-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88
+; GFX900-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24
+; GFX900-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92
+; GFX900-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28
+; GFX900-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96
+; GFX900-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
+; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100
+; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36
+; GFX900-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104
+; GFX900-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
+; GFX900-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108
+; GFX900-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44
+; GFX900-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112
+; GFX900-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48
+; GFX900-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116
+; GFX900-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
+; GFX900-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120
+; GFX900-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56
+; GFX900-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124
+; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60
+; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_cndmask_b32_e64 v30, v31, v32, s[34:35]
+; GFX900-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX900-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[30:31]
+; GFX900-NEXT: v_cndmask_b32_e64 v32, v28, v29, s[94:95]
+; GFX900-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX900-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, v29, s[92:93]
+; GFX900-NEXT: v_cndmask_b32_e64 v29, v26, v27, s[90:91]
+; GFX900-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX900-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX900-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89]
+; GFX900-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[78:79]
+; GFX900-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX900-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX900-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77]
+; GFX900-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[74:75]
+; GFX900-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX900-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73]
+; GFX900-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[62:63]
+; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX900-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX900-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61]
+; GFX900-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[58:59]
+; GFX900-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX900-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX900-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57]
+; GFX900-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[46:47]
+; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX900-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45]
+; GFX900-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[42:43]
+; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX900-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41]
+; GFX900-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29]
+; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX900-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27]
+; GFX900-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[24:25]
+; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX900-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23]
+; GFX900-NEXT: v_cndmask_b32_e64 v11, v8, v9, s[20:21]
+; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX900-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19]
+; GFX900-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[16:17]
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX900-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15]
+; GFX900-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[12:13]
+; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX900-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
+; GFX900-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[8:9]
+; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7]
+; GFX900-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5]
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_perm_b32 v1, v2, v5, s4
+; GFX900-NEXT: v_perm_b32 v2, v4, v7, s4
+; GFX900-NEXT: v_perm_b32 v3, v6, v9, s4
+; GFX900-NEXT: v_perm_b32 v4, v8, v11, s4
+; GFX900-NEXT: v_perm_b32 v5, v10, v13, s4
+; GFX900-NEXT: v_perm_b32 v6, v12, v15, s4
+; GFX900-NEXT: v_perm_b32 v7, v14, v17, s4
+; GFX900-NEXT: v_perm_b32 v8, v16, v19, s4
+; GFX900-NEXT: v_perm_b32 v9, v18, v21, s4
+; GFX900-NEXT: v_perm_b32 v10, v20, v23, s4
+; GFX900-NEXT: v_perm_b32 v11, v22, v25, s4
+; GFX900-NEXT: v_perm_b32 v12, v24, v27, s4
+; GFX900-NEXT: v_perm_b32 v13, v26, v29, s4
+; GFX900-NEXT: v_perm_b32 v14, v28, v32, s4
+; GFX900-NEXT: v_perm_b32 v15, v31, v30, s4
+; GFX900-NEXT: v_readlane_b32 s35, v33, 3
+; GFX900-NEXT: v_readlane_b32 s34, v33, 2
+; GFX900-NEXT: v_readlane_b32 s31, v33, 1
+; GFX900-NEXT: v_readlane_b32 s30, v33, 0
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_vselect_v32bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse
+; GFX950-NEXT: scratch_load_dword v31, off, s32 offset:60
+; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:124
+; GFX950-NEXT: scratch_load_ushort v33, off, s32
+; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:64
+; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:128
+; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:120
+; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:56
+; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:116
+; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:52
+; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:112
+; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:48
+; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:88
+; GFX950-NEXT: scratch_load_dword v51, off, s32 offset:24
+; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:92
+; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:28
+; GFX950-NEXT: scratch_load_dword v54, off, s32 offset:108
+; GFX950-NEXT: scratch_load_dword v55, off, s32 offset:44
+; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:96
+; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:32
+; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:100
+; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:36
+; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:104
+; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:40
+; GFX950-NEXT: v_and_b32_e32 v29, 1, v29
+; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v29
+; GFX950-NEXT: scratch_load_dword v29, off, s32 offset:84
+; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:20
+; GFX950-NEXT: v_and_b32_e32 v28, 1, v28
+; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
+; GFX950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v28
+; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse
+; GFX950-NEXT: v_and_b32_e32 v26, 1, v26
+; GFX950-NEXT: v_and_b32_e32 v27, 1, v27
+; GFX950-NEXT: v_and_b32_e32 v24, 1, v24
+; GFX950-NEXT: v_and_b32_e32 v25, 1, v25
+; GFX950-NEXT: v_and_b32_e32 v22, 1, v22
+; GFX950-NEXT: v_and_b32_e32 v23, 1, v23
+; GFX950-NEXT: v_and_b32_e32 v20, 1, v20
+; GFX950-NEXT: v_and_b32_e32 v21, 1, v21
+; GFX950-NEXT: v_and_b32_e32 v18, 1, v18
+; GFX950-NEXT: v_and_b32_e32 v19, 1, v19
+; GFX950-NEXT: v_and_b32_e32 v16, 1, v16
+; GFX950-NEXT: v_and_b32_e32 v17, 1, v17
+; GFX950-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX950-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX950-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX950-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX950-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX950-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX950-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX950-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX950-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX950-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX950-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX950-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX950-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX950-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX950-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX950-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX950-NEXT: s_waitcnt vmcnt(24)
+; GFX950-NEXT: v_lshrrev_b32_e32 v46, 16, v31
+; GFX950-NEXT: s_waitcnt vmcnt(23)
+; GFX950-NEXT: v_lshrrev_b32_e32 v47, 16, v32
+; GFX950-NEXT: s_waitcnt vmcnt(22)
+; GFX950-NEXT: v_and_b32_e32 v28, 1, v33
+; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:80
+; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:16
+; GFX950-NEXT: v_cmp_eq_u32_e64 s[2:3], 1, v28
+; GFX950-NEXT: v_and_b32_e32 v28, 1, v30
+; GFX950-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v28
+; GFX950-NEXT: scratch_load_dword v28, off, s32 offset:76
+; GFX950-NEXT: scratch_load_dword v30, off, s32 offset:12
+; GFX950-NEXT: s_waitcnt vmcnt(25)
+; GFX950-NEXT: v_lshrrev_b32_e32 v58, 16, v34
+; GFX950-NEXT: s_waitcnt vmcnt(24)
+; GFX950-NEXT: v_lshrrev_b32_e32 v59, 16, v35
+; GFX950-NEXT: v_cndmask_b32_e64 v34, v35, v34, s[4:5]
+; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:72
+; GFX950-NEXT: v_cndmask_b32_e64 v58, v59, v58, s[2:3]
+; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:8
+; GFX950-NEXT: v_cndmask_b32_e64 v31, v32, v31, s[0:1]
+; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:68
+; GFX950-NEXT: v_cndmask_b32_e32 v46, v47, v46, vcc
+; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:4
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_waitcnt vmcnt(26)
+; GFX950-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v27
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v24
+; GFX950-NEXT: s_waitcnt vmcnt(24)
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v39
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v38
+; GFX950-NEXT: v_cndmask_b32_e32 v24, v38, v39, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v25
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v25, v37, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22
+; GFX950-NEXT: s_waitcnt vmcnt(22)
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v49
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v48
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v23, v37, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20
+; GFX950-NEXT: s_waitcnt vmcnt(16)
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v55
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v54
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v54, v55, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v21, v37, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18
+; GFX950-NEXT: s_waitcnt vmcnt(10)
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v45
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v44
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v44, v45, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
+; GFX950-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse
+; GFX950-NEXT: v_cndmask_b32_e32 v19, v37, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v43
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v42
+; GFX950-NEXT: v_cndmask_b32_e32 v16, v42, v43, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17
+; GFX950-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX950-NEXT: v_cndmask_b32_e32 v17, v37, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v41
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v40
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v40, v41, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX950-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v37, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v53
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v52
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v52, v53, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v37, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v51
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v50
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v50, v51, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v37, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX950-NEXT: s_waitcnt vmcnt(8)
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v56
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v29, v56, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
+; GFX950-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v29, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX950-NEXT: s_waitcnt vmcnt(6)
+; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v57
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v33, v57, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX950-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v33, v29, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX950-NEXT: s_waitcnt vmcnt(4)
+; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v30
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v28, v30, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v28, v29, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX950-NEXT: s_waitcnt vmcnt(2)
+; GFX950-NEXT: v_lshrrev_b32_e32 v28, 16, v59
+; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v35
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v35, v59, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX950-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v29, v28, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_lshrrev_b32_e32 v28, 16, v47
+; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v32
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v32, v47, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX950-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v29, v28, vcc
+; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0
+; GFX950-NEXT: v_perm_b32 v1, v3, v2, s0
+; GFX950-NEXT: v_perm_b32 v2, v5, v4, s0
+; GFX950-NEXT: v_perm_b32 v3, v7, v6, s0
+; GFX950-NEXT: v_perm_b32 v4, v9, v8, s0
+; GFX950-NEXT: v_perm_b32 v5, v11, v10, s0
+; GFX950-NEXT: v_perm_b32 v6, v13, v12, s0
+; GFX950-NEXT: v_perm_b32 v7, v15, v14, s0
+; GFX950-NEXT: v_perm_b32 v8, v17, v16, s0
+; GFX950-NEXT: v_perm_b32 v9, v19, v18, s0
+; GFX950-NEXT: v_perm_b32 v10, v21, v20, s0
+; GFX950-NEXT: v_perm_b32 v11, v23, v22, s0
+; GFX950-NEXT: v_perm_b32 v12, v25, v24, s0
+; GFX950-NEXT: v_perm_b32 v13, v27, v26, s0
+; GFX950-NEXT: v_perm_b32 v14, v46, v31, s0
+; GFX950-NEXT: v_perm_b32 v15, v58, v34, s0
+; GFX950-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse
+; GFX950-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_vselect_v32bf16:
 ; GFX10: ; %bb.0:
@@ -42769,21 +46080,31 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fma_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fma_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fma_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v2, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_bf16:
 ; GFX10: ; %bb.0:
@@ -42912,31 +46233,45 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fma_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX9-NEXT: v_fma_f32 v3, v5, v4, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fma_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_fma_f32 v3, v5, v4, v3
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fma_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_fmac_f32_e32 v3, v5, v4
+; GFX950-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v2, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_v2bf16:
 ; GFX10: ; %bb.0:
@@ -43118,41 +46453,60 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fma_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v0
-; GFX9-NEXT: v_fma_f32 v3, v6, v5, v3
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4
-; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fma_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_fma_f32 v1, v1, v3, v5
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX900-NEXT: v_fma_f32 v3, v6, v5, v3
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_fma_f32 v0, v0, v2, v4
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fma_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_fmac_f32_e32 v5, v1, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v5, s0
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_fmac_f32_e32 v3, v6, v5
+; GFX950-NEXT: v_fmac_f32_e32 v4, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v4, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_v3bf16:
 ; GFX10: ; %bb.0:
@@ -43394,50 +46748,73 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fma_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1
-; GFX9-NEXT: v_fma_f32 v6, v8, v7, v6
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5
-; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX9-NEXT: v_fma_f32 v3, v7, v5, v3
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4
-; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fma_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX900-NEXT: v_fma_f32 v6, v8, v7, v6
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_fma_f32 v1, v1, v3, v5
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX900-NEXT: v_fma_f32 v3, v7, v5, v3
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_fma_f32 v0, v0, v2, v4
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fma_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX950-NEXT: v_fmac_f32_e32 v5, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX950-NEXT: v_fmac_f32_e32 v1, v7, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_fmac_f32_e32 v3, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v3, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v5, v6
+; GFX950-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_v4bf16:
 ; GFX10: ; %bb.0:
@@ -43640,28 +47017,41 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmuladd_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fmuladd_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmuladd_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX950-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmuladd_bf16:
 ; GFX10: ; %bb.0:
@@ -43839,45 +47229,65 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmuladd_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fmuladd_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmuladd_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX950-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX950-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX950-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmuladd_v2bf16:
 ; GFX10: ; %bb.0:
@@ -44145,62 +47555,90 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmuladd_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fmuladd_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX900-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmuladd_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX950-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX950-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX950-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX950-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmuladd_v3bf16:
 ; GFX10: ; %bb.0:
@@ -44560,78 +47998,113 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmuladd_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1
-; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6
-; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v5
-; GFX9-NEXT: v_add_f32_e32 v6, v6, v7
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fmuladd_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX900-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GFX900-NEXT: v_add_f32_e32 v6, v6, v7
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX900-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+;
GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmuladd_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX950-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX950-NEXT: v_add_f32_e32 v6, v6, v7 +; GFX950-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmuladd_v4bf16: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 7eb7d72..006fe51 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -766,10 +766,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr46, implicit $exec - ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr47, killed $vgpr10, 1, implicit $exec - ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_opsel_e64 0, killed $sgpr47, 0, killed $vgpr10, 0, 1, 0, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr17, 0, $vgpr16, 0, 1, 0, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr15, 0, $vgpr14, 0, 1, 0, 0, implicit $exec ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir index 253e7e2..0e5ef3c 100644 --- 
a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir @@ -68,7 +68,7 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.7(0x7c000000) ; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:VRegOrLds_32 */, def renamable $sgpr4 + ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:SReg_32 */, def renamable $sgpr4 ; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit killed $scc ; CHECK-NEXT: {{ $}} @@ -149,7 +149,7 @@ body: | successors: %bb.3(0x04000000), %bb.2(0x7c000000) liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 - INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:SReg_32 */, def renamable $sgpr4 + INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:SReg_32 */, def renamable $sgpr4 S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc S_CBRANCH_SCC1 %bb.2, implicit killed $scc diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir index 474ba71..a25c52f 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir @@ -69,7 +69,7 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x04000000), 
%bb.7(0x7c000000) ; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:VRegOrLds_32 */, def renamable $sgpr4 + ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:SReg_32 */, def renamable $sgpr4 ; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit killed $scc ; CHECK-NEXT: {{ $}} @@ -151,7 +151,7 @@ body: | successors: %bb.3(0x04000000), %bb.2(0x7c000000) liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 - INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:SReg_32 */, def renamable $sgpr4 + INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:SReg_32 */, def renamable $sgpr4 S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc S_CBRANCH_SCC1 %bb.2, implicit killed $scc diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir index 4404f1a..ac8ef48 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir @@ -20,10 +20,10 @@ body: | ; CHECK-LABEL: name: foo1 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VS_32 */, def undef %2.sub0, 1835019 /* regdef-ec:VS_32 */, def undef 
early-clobber %2.sub1 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %2.sub0, 2228235 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %0:vgpr_32, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %0:vgpr_32, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %0 %2.sub1:vreg_64 = COPY killed %1 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -41,10 +41,10 @@ body: | ; CHECK-LABEL: name: foo2 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VS_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VS_32 */, def undef %2.sub0 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 2228234 /* regdef:VGPR_32 */, def undef %2.sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %0:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 2228234 /* regdef:VGPR_32 */, def %0:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %0 %2.sub1:vreg_64 = COPY killed %1 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -62,10 +62,10 @@ body: | ; CHECK-LABEL: name: foo3 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VS_32 */, def undef %2.sub0, 1835019 /* regdef-ec:VS_32 */, def undef early-clobber %2.sub1 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %2.sub0, 2228235 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1:vgpr_32, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %1:vgpr_32, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %1 %2.sub1:vreg_64 = COPY killed %0 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -83,10 +83,10 @@ body: | ; CHECK-LABEL: name: foo4 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VS_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VS_32 */, def undef %2.sub0 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 2228234 /* regdef:VGPR_32 */, def undef %2.sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 1835018 /* 
regdef:VGPR_32 */, def %1:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 2228234 /* regdef:VGPR_32 */, def %1:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %1 %2.sub1:vreg_64 = COPY killed %0 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll new file mode 100644 index 0000000..01ebe7d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll @@ -0,0 +1,298 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX1250-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s + +/* TODO: Support safe bf16 fdiv lowering. +define bfloat @v_fdiv_bf16(bfloat %x, bfloat %y) { + %fdiv = fdiv bfloat %x, %y + ret bfloat %fdiv +} +*/ + +define bfloat @v_rcp_bf16(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rcp_bf16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rcp_bf16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fdiv = fdiv bfloat 1.0, %x + ret bfloat %fdiv +} + +define bfloat @v_rcp_bf16_abs(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rcp_bf16_abs: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, |v0.l| +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rcp_bf16_abs: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, |v0| +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call bfloat @llvm.fabs.bf16(bfloat %x) + %fdiv = fdiv bfloat 1.0, %fabs + ret bfloat %fdiv +} + +define bfloat @v_rcp_bf16_afn(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rcp_bf16_afn: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rcp_bf16_afn: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fdiv = fdiv afn bfloat 1.0, %x + ret bfloat %fdiv +} + +define bfloat @v_rcp_bf16_neg(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rcp_bf16_neg: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rcp_bf16_neg: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + 
%fdiv = fdiv bfloat -1.0, %x + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. +define bfloat @v_rsq_bf16(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv contract bfloat 1.0, %sqrt + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. +define bfloat @v_rsq_bf16_neg(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16_neg: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16_neg: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv contract bfloat -1.0, %sqrt + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. +define <2 x bfloat> @v_rsq_bf16_multi_use(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16_multi_use: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v1.l, v1.l +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v1.h, v1.l +; GFX1250-TRUE16-NEXT: v_nop +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16_multi_use: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1 +; GFX1250-FAKE16-NEXT: v_nop +; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv contract bfloat 1.0, %sqrt + %r = insertelement <2 x bfloat> zeroinitializer, bfloat %x, i32 0 + %r2 = insertelement <2 x bfloat> %r, bfloat %fdiv, i32 1 + ret <2 x bfloat> %r2 +} + +; TODO: Support lowering to v_rsq_bf16. 
+define bfloat @v_rsq_bf16_missing_contract0(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract0: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16_missing_contract0: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv contract bfloat 1.0, %sqrt + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. +define bfloat @v_rsq_bf16_missing_contract1(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract1: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16_missing_contract1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv bfloat 1.0, %sqrt + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. 
+define bfloat @v_neg_rsq_bf16_missing_contract1(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_neg_rsq_bf16_missing_contract1: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_neg_rsq_bf16_missing_contract1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv bfloat -1.0, %sqrt + ret bfloat %fdiv +} + +define <2 x bfloat> @v_rsq_v2bf16(<2 x bfloat> %a) { +; GFX1250-TRUE16-LABEL: v_rsq_v2bf16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.h, v0.h +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_v2bf16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1 +; GFX1250-FAKE16-NEXT: v_nop +; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a) + %fdiv = fdiv contract <2 x bfloat> <bfloat 1.0, bfloat 1.0>, %sqrt + ret <2 x bfloat> %fdiv +} + +define <2 x bfloat> @v_neg_rsq_v2bf16(<2 x bfloat> %a) { +; GFX1250-TRUE16-LABEL: v_neg_rsq_v2bf16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.h, -v0.h +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_neg_rsq_v2bf16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; 
GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v1, -v1 +; GFX1250-FAKE16-NEXT: v_nop +; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a) + %fdiv = fdiv contract <2 x bfloat> <bfloat -1.0, bfloat -1.0>, %sqrt + ret <2 x bfloat> %fdiv +} diff --git a/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll new file mode 100644 index 0000000..b68786b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=irtranslator -verify-machineinstrs < %s | FileCheck %s + +define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { + ; CHECK-LABEL: name: basic_test + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[C]] + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY1]], [[C1]] + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s32), [[SELECT1]](s32), 1, 1, 1, 0 + ; CHECK-NEXT: $vgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0 + %x = select i1 %active, i32 %a, i32 5 + %y = select i1 %active, i32 %b, i32 3 + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Make sure we don't crash if %active is not used at all. 
+define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) { + ; CHECK-LABEL: name: unused_active + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 14 + ; CHECK-NEXT: $vgpr0 = COPY [[C]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0 + ret i32 14 +} + +define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { + ; CHECK-LABEL: name: multiple_blocks + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s1), [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[ICMP]](s1) + ; CHECK-NEXT: G_BRCOND [[INT]](s1), %bb.2 + ; CHECK-NEXT: G_BR %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.if.then: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.if.end: + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[COPY1]](s32), %bb.1, [[ADD]](s32), %bb.2 + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT1]](s32) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[PHI]] + ; CHECK-NEXT: $vgpr0 = COPY [[SELECT]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0 + %c = icmp eq i32 %a, %b + br i1 %c, label %if.then, label %if.end + +if.then: ; preds = %0 + %d = add i32 %a, %b + br label %if.end + +if.end: + %f = phi i32 [ %d, %if.then ], [ %b, %0 ] + %e = select i1 %active, i32 %a, i32 %f + ret i32 %e +} + +define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { + ; CHECK-LABEL: name: ret_64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[MV]], [[C]] + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[MV1]], [[C1]] + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s64), [[SELECT1]](s64), 1, 1, 1, 0 + ; CHECK-NEXT: 
[[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INTRINSIC_CONVERGENT]](s64) + ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) + ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0, implicit $vgpr1 + %x = select i1 %active, i64 %a, i64 5 + %y = select i1 %active, i64 %b, i64 3 + %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false) + ret i64 %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll new file mode 100644 index 0000000..3450d63 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll @@ -0,0 +1,191 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=finalize-isel < %s | FileCheck --check-prefix=DAGISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=finalize-isel < %s | FileCheck --check-prefix=GISEL %s + +define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { + ; DAGISEL-LABEL: name: basic_test + ; DAGISEL: bb.0 (%ir-block.0): + ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; DAGISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; DAGISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_1]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; DAGISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], killed [[V_CNDMASK_B32_e64_1]], 1, 1, 1, 0, implicit $exec + ; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]] + ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + ; + ; GISEL-LABEL: name: basic_test + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; GISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY2]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY3]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], [[V_CNDMASK_B32_e64_1]], 1, 1, 1, 0, implicit $exec + ; GISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]] + ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN 
[[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + %x = select i1 %active, i32 %a, i32 5 + %y = select i1 %active, i32 %b, i32 3 + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Make sure we don't crash if %active is not used at all. +define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) { + ; DAGISEL-LABEL: name: unused_active + ; DAGISEL: bb.0 (%ir-block.0): + ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; DAGISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 14, implicit $exec + ; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]] + ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + ; + ; GISEL-LABEL: name: unused_active + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; GISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14 + ; GISEL-NEXT: $vgpr0 = COPY [[S_MOV_B32_]] + ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + ret i32 14 +} + +define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { + ; DAGISEL-LABEL: name: multiple_blocks + ; DAGISEL: bb.0 (%ir-block.0): + ; DAGISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[SI_WHOLE_WAVE_FUNC_SETUP]] + ; DAGISEL-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY1]], [[COPY]], implicit $exec + ; DAGISEL-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; DAGISEL-NEXT: S_BRANCH %bb.1 + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: bb.1.if.then: + ; DAGISEL-NEXT: successors: %bb.2(0x80000000) + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], [[COPY]], 0, implicit $exec + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: bb.2.if.end: + ; DAGISEL-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, [[V_ADD_U32_e64_]], %bb.1 + ; DAGISEL-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; DAGISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]] + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[PHI]], 0, [[COPY1]], [[COPY3]], implicit $exec + ; DAGISEL-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + ; + ; GISEL-LABEL: name: multiple_blocks + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = 
SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; GISEL-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GISEL-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec + ; GISEL-NEXT: S_BRANCH %bb.2 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: bb.2.if.then: + ; GISEL-NEXT: successors: %bb.3(0x80000000) + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: bb.3.if.end: + ; GISEL-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.1, [[V_ADD_U32_e64_]], %bb.2 + ; GISEL-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[PHI]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + %c = icmp eq i32 %a, %b + br i1 %c, label %if.then, label %if.end + +if.then: ; preds = %0 + %d = add i32 %a, %b + br label %if.end + +if.end: + %f = phi i32 [ %d, %if.then ], [ %b, %0 ] + %e = select i1 %active, i32 %a, i32 %f + ret i32 %e +} + +define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { + ; DAGISEL-LABEL: name: ret_64 + ; DAGISEL: bb.0 (%ir-block.0): + ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; DAGISEL-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; DAGISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[SI_WHOLE_WAVE_FUNC_SETUP]] + ; DAGISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; DAGISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_]], 0, killed [[COPY5]], [[COPY4]], implicit $exec + ; DAGISEL-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; DAGISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_1]], 0, killed [[COPY6]], [[COPY4]], implicit $exec + ; DAGISEL-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_]], 0, killed [[COPY7]], [[COPY4]], implicit $exec + ; DAGISEL-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; DAGISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 3 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_2]], 0, killed [[COPY8]], [[COPY4]], 
implicit $exec + ; DAGISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_1]], killed [[V_CNDMASK_B32_e64_3]], 1, 1, 1, 0, implicit $exec + ; DAGISEL-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], killed [[V_CNDMASK_B32_e64_2]], 1, 1, 1, 0, implicit $exec + ; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]] + ; DAGISEL-NEXT: $vgpr1 = COPY [[V_MOV_B32_dpp1]] + ; DAGISEL-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0, implicit $vgpr1 + ; + ; GISEL-LABEL: name: ret_64 + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_1]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_2]], 0, [[COPY2]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_3]], 0, [[COPY3]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], [[V_CNDMASK_B32_e64_2]], 1, 1, 1, 0, implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_1]], [[V_CNDMASK_B32_e64_3]], 1, 1, 1, 0, implicit $exec + ; GISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]] + ; GISEL-NEXT: $vgpr1 = COPY [[V_MOV_B32_dpp1]] + ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0, implicit $vgpr1 + %x = select i1 %active, i64 %a, i64 5 + %y = select i1 %active, i64 %b, i64 3 + %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false) + ret i64 %ret +} + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll index b77b2f7..1ec4f25 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s ; 
RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s @@ -19,6 +21,30 @@ define amdgpu_kernel void @v_alignbyte_b32(ptr addrspace(1) %out, i32 %src1, i32 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; +; GFX9-LABEL: v_alignbyte_b32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_alignbyte_b32 v1, s0, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_alignbyte_b32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_alignbyte_b32 v0, s0, s1, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; ; GFX11-TRUE16-LABEL: v_alignbyte_b32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_clause 0x1 @@ -73,6 +99,41 @@ define amdgpu_kernel void @v_alignbyte_b32_2(ptr addrspace(1) %out, ptr addrspac ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; +; GFX9-LABEL: v_alignbyte_b32_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x3c +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_alignbyte_b32 v1, v1, v2, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_alignbyte_b32_2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x3c +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_alignbyte_b32 v0, v1, v0, s2 +; GFX10-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; ; GFX11-TRUE16-LABEL: v_alignbyte_b32_2: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll index 25889de..9565314 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll @@ -9,6 +9,172 @@ declare half @llvm.amdgcn.cvt.f16.fp8(i32, i32) declare <2 x half> @llvm.amdgcn.cvt.pk.f16.bf8(i16) declare <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16) +define amdgpu_ps float @test_cvt_f16_bf8_byte0(i32 %a) { +; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte0: +; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e32 v0.l, v0 +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, 
v0.l +; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte0: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte0: +; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e32 v0.l, v0 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte0: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e32 v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog + %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 0) + %ret = fpext half %cvt to float + ret float %ret +} + +define amdgpu_ps float @test_cvt_f16_bf8_byte1(i32 %a) { +; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte1: +; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:1 +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte1: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:1 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte1: +; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte1: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog + %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 1) + %ret = fpext half %cvt to float + ret float %ret +} + +define amdgpu_ps float @test_cvt_f16_bf8_byte2(i32 %a) { +; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte2: +; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:2 +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte2: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:2 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte2: +; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:2 +; 
GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte2: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:2 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog + %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 2) + %ret = fpext half %cvt to float + ret float %ret +} + +define amdgpu_ps float @test_cvt_f16_bf8_byte3(i32 %a) { +; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte3: +; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:3 +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte3: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte3: +; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:3 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte3: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog + %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 3) + %ret = fpext half %cvt to float + ret float %ret +} + +define amdgpu_ps float @test_cvt_f16_bf8_byte3_hi(i32 %a) { +; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte3_hi: +; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.h, v0 byte_sel:3 +; GFX1250-SDAG-REAL16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte3_hi: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, 0, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte3_hi: +; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:3 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_lshl_or_b32 v0, v0, 16, 0 +; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte3_hi: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, 0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog + %cvt = tail call half 
@llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 3) + %ins.0 = insertelement <2 x half> undef, half 0.0, i32 0 + %ins.1 = insertelement <2 x half> %ins.0, half %cvt, i32 1 + %ret = bitcast <2 x half> %ins.1 to float + ret float %ret +} + define amdgpu_ps float @test_cvt_f16_fp8_byte0(i32 %a) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte0: ; GFX1250-SDAG-REAL16: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll index 2f5ff90..9149ed5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll @@ -304,6 +304,556 @@ bb: ret void } +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: 
v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_BF6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_BF6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 
+; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 1, <16 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C) + store 
<8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 2, <12 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 2, <12 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 +; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off +; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 2, <12 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps 
void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 3, <12 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; 
GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 3, <12 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 +; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off +; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 3, <12 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], 
v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 +; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off +; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 +; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off +; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 +; 
GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v8i32(i32 4, <8 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb @@ -815,6 +1365,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) @@ -824,6 +1375,7 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>) + declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1) declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1) declare <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll index fe8358f..12ea314 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll @@ -1342,6 +1342,110 @@ bb: ret void } +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: 
global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v35, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v36, v34 :: v_dual_mov_b32 v37, v34 +; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34 +; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34 +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s1, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34 +; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34 +; GFX1250-NEXT: v_mov_b32_e32 v41, v34 +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: 
global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb @@ -2227,6 +2331,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll index 9802144a..bf8308b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll @@ -1126,6 +1126,72 @@ bb: ret void } +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: 
global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 1, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 3, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 4, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_negC: ; GFX1250: ; %bb.0: ; %bb @@ -1967,6 +2033,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll new file mode 100644 index 0000000..ced96ee --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=GCN %s +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +; FIXME: GlobalISel does not work with bf16 + +declare bfloat @llvm.cos.bf16(bfloat) #0 + +define amdgpu_kernel void @cos_bf16_constant_4(ptr addrspace(1) %out) #1 { +; GCN-LABEL: cos_bf16_constant_4: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GCN-NEXT: v_cos_bf16_e32 v0, 0x3f23 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_store_b16 v1, v0, s[0:1] +; GCN-NEXT: s_endpgm + %cos = call bfloat @llvm.cos.bf16(bfloat 4.0) #0 + store bfloat %cos, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @cos_bf16_constant_100(ptr addrspace(1) %out) #1 { +; GCN-LABEL: cos_bf16_constant_100: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GCN-NEXT: v_cos_bf16_e32 v0, 0x417f +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_store_b16 v1, v0, s[0:1] +; GCN-NEXT: s_endpgm + %cos = call bfloat @llvm.cos.bf16(bfloat 100.0) #0 + store bfloat %cos, ptr addrspace(1) %out, align 2 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 978f223..8c1e166 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -5213,121 +5213,15 @@ define float @v_exp_f32_dynamic_mode(float %in) #1 { } define float @v_exp_f32_undef() { -; VI-SDAG-LABEL: v_exp_f32_undef: -; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_rndne_f32_e32 v0, 0 -; VI-SDAG-NEXT: s_mov_b32 s4, 0x7fc00000 -; VI-SDAG-NEXT: v_add_f32_e64 v1, -v0, s4 -; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 -; VI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; VI-GISEL-LABEL: v_exp_f32_undef: -; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sub_f32_e64 v0, s4, 0 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8a000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x39a3b295 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v2, v0 -; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v1 -; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 -; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; VI-GISEL-NEXT: 
v_cndmask_b32_e32 v0, v0, v2, vcc -; VI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-SDAG-LABEL: v_exp_f32_undef: -; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xffc00000 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, v1 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 -; GFX900-SDAG-NEXT: v_rndne_f32_e32 v1, 0x7fc00000 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, 0x7fc00000, v1 -; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_exp_f32_undef: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s4, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v2, v0 -; GFX900-GISEL-NEXT: v_rndne_f32_e32 v2, v1 -; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 -; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; SI-SDAG-LABEL: v_exp_f32_undef: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xffc00000 -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, v1 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 -; SI-SDAG-NEXT: v_rndne_f32_e32 v1, 0x7fc00000 -; SI-SDAG-NEXT: v_sub_f32_e32 v2, 0x7fc00000, v1 -; SI-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_exp_f32_undef: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ; -; SI-GISEL-LABEL: v_exp_f32_undef: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v0 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v2, v0 -; SI-GISEL-NEXT: v_rndne_f32_e32 v2, v1 -; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: 
s_setpc_b64 s[30:31] +; SI-LABEL: v_exp_f32_undef: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_f32_undef: ; R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index 70c3787..edc505b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -5291,121 +5291,15 @@ define float @v_exp10_f32_dynamic_mode(float %in) #1 { } define float @v_exp10_f32_undef() { -; VI-SDAG-LABEL: v_exp10_f32_undef: -; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_rndne_f32_e32 v0, 0 -; VI-SDAG-NEXT: s_mov_b32 s4, 0x7fc00000 -; VI-SDAG-NEXT: v_add_f32_e64 v1, -v0, s4 -; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 -; VI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; VI-GISEL-LABEL: v_exp10_f32_undef: -; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sub_f32_e64 v0, s4, 0 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3a2784bc -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v2, v0 -; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v1 -; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 -; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; VI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-SDAG-LABEL: v_exp10_f32_undef: -; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xffc00000 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, v1 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 -; GFX900-SDAG-NEXT: v_rndne_f32_e32 v1, 0x7fc00000 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, 0x7fc00000, v1 -; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_exp10_f32_undef: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s4, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v2, v0 -; GFX900-GISEL-NEXT: v_rndne_f32_e32 v2, v1 -; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 -; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; 
GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; SI-SDAG-LABEL: v_exp10_f32_undef: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xffc00000 -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, v1 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 -; SI-SDAG-NEXT: v_rndne_f32_e32 v1, 0x7fc00000 -; SI-SDAG-NEXT: v_sub_f32_e32 v2, 0x7fc00000, v1 -; SI-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_exp10_f32_undef: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ; -; SI-GISEL-LABEL: v_exp10_f32_undef: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v0 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v2, v0 -; SI-GISEL-NEXT: v_rndne_f32_e32 v2, v1 -; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp10_f32_undef: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_f32_undef: ; R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 15bcab9..e71ea50 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -2783,56 +2783,10 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 { } define float @v_exp2_f32_undef() { -; GCN-SDAG-LABEL: v_exp2_f32_undef: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, 0x7fc00000 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp2_f32_undef: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; SI-GISEL-NEXT: v_add_f32_e32 v1, s4, v1 -; SI-GISEL-NEXT: v_add_f32_e64 v2, s4, 0 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; VI-GISEL-LABEL: v_exp2_f32_undef: -; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; 
VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; VI-GISEL-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-GISEL-NEXT: v_add_f32_e64 v2, s4, 0 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; VI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_exp2_f32_undef: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; GFX900-GISEL-NEXT: v_add_f32_e32 v1, s4, v1 -; GFX900-GISEL-NEXT: v_add_f32_e64 v2, s4, 0 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_exp2_f32_undef: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp2_f32_undef: ; R600: ; %bb.0: @@ -4076,3 +4030,4 @@ attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" } attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN-GISEL: {{.*}} +; GCN-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index 5634df5..38d1b47 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -5590,162 +5590,15 @@ define float @v_log_f32_dynamic_mode(float %in) #1 { } define float @v_log_f32_undef() { -; SI-SDAG-LABEL: v_log_f32_undef: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_log_f32_e32 v0, s4 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 -; SI-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf -; SI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 -; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log_f32_undef: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 -; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317217, v0 -; SI-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v2 -; SI-GISEL-NEXT: v_fma_f32 v1, v0, v3, v1 -; SI-GISEL-NEXT: v_add_f32_e32 v1, v2, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; VI-SDAG-LABEL: v_log_f32_undef: -; 
VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_log_f32_e32 v0, s4 -; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 -; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v0, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v3, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; VI-GISEL-LABEL: v_log_f32_undef: -; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; VI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-SDAG-LABEL: v_log_f32_undef: -; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s4 -; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 -; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf -; GFX900-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; GFX900-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_log_f32_undef: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317217, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v2 -; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v3, v1 -; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, 
v0, v1 -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-SDAG-LABEL: v_log_f32_undef: -; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s0 -; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX689-LABEL: v_log_f32_undef: +; GFX689: ; %bb.0: +; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-GISEL-LABEL: v_log_f32_undef: -; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, s0, v0, vcc_lo -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: v_log_f32_undef: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log_f32_undef: ; R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 8d1a231..058933f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -5590,162 +5590,15 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 { } define float @v_log10_f32_undef() { -; SI-SDAG-LABEL: v_log10_f32_undef: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_log_f32_e32 v0, s4 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a -; SI-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf -; SI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 -; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log10_f32_undef: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; 
SI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a -; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v0 -; SI-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v2 -; SI-GISEL-NEXT: v_fma_f32 v1, v0, v3, v1 -; SI-GISEL-NEXT: v_add_f32_e32 v1, v2, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; VI-SDAG-LABEL: v_log10_f32_undef: -; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_log_f32_e32 v0, s4 -; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 -; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v0, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v3, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; VI-GISEL-LABEL: v_log10_f32_undef: -; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; VI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-SDAG-LABEL: v_log10_f32_undef: -; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s4 -; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a -; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf -; GFX900-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; GFX900-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_log10_f32_undef: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-GISEL-NEXT: 
v_mov_b32_e32 v1, 0x4f800000 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v2 -; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v3, v1 -; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-SDAG-LABEL: v_log10_f32_undef: -; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s0 -; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX689-LABEL: v_log10_f32_undef: +; GFX689: ; %bb.0: +; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-GISEL-LABEL: v_log10_f32_undef: -; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, s0, v0, vcc_lo -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: v_log10_f32_undef: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_f32_undef: ; R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 7ca72bf..4ca612a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -3542,45 +3542,15 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 { } 
define float @v_log2_f32_undef() { -; GFX689-SDAG-LABEL: v_log2_f32_undef: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, s4 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX689-GISEL-LABEL: v_log2_f32_undef: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX689-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-SDAG-LABEL: v_log2_f32_undef: -; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s0 -; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX689-LABEL: v_log2_f32_undef: +; GFX689: ; %bb.0: +; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-GISEL-LABEL: v_log2_f32_undef: -; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, s0, v0, vcc_lo -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: v_log2_f32_undef: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_f32_undef: ; R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll new file mode 100644 index 0000000..7a355a3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=GCN %s +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +; FIXME: GlobalISel does not work with bf16 + +declare bfloat @llvm.sin.bf16(bfloat) #0 + +define amdgpu_kernel void @sin_bf16_constant_4(ptr addrspace(1) %out) #1 { +; GCN-LABEL: sin_bf16_constant_4: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GCN-NEXT: v_sin_bf16_e32 v0, 0x3f23 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_store_b16 v1, v0, s[0:1] +; GCN-NEXT: s_endpgm + %sin = call bfloat @llvm.sin.bf16(bfloat 4.0) #0 + store bfloat %sin, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @sin_bf16_constant_100(ptr addrspace(1) %out) #1 { +; GCN-LABEL: sin_bf16_constant_100: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GCN-NEXT: v_sin_bf16_e32 v0, 0x417f +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; 
GCN-NEXT: global_store_b16 v1, v0, s[0:1] +; GCN-NEXT: s_endpgm + %sin = call bfloat @llvm.sin.bf16(bfloat 100.0) #0 + store bfloat %sin, ptr addrspace(1) %out, align 2 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll index 047bdde..8281320 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll @@ -11,11 +11,13 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace ; CHECK-NEXT: #dbg_value(ptr addrspace(5) [[BUF_PTR_VAR]], [[META10:![0-9]+]], !DIExpression(), [[DBG21]]) ; CHECK-NEXT: [[AUX_PTR_VAR:%.*]] = alloca i160, align 32, addrspace(5), !dbg [[DBG22:![0-9]+]] ; CHECK-NEXT: #dbg_value(ptr addrspace(5) [[AUX_PTR_VAR]], [[META12:![0-9]+]], !DIExpression(), [[DBG22]]) -; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META13:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) +; CHECK-NEXT: #dbg_value(i32 0, [[META13:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[META23:![0-9]+]]) +; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF]], [[META13]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[META23]]) ; CHECK-NEXT: [[BUF_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[BUF]] to i160, !dbg [[DBG24:![0-9]+]] ; CHECK-NEXT: [[BUF_PTR_INT:%.*]] = shl nuw i160 [[BUF_PTR_INT_RSRC]], 32, !dbg [[DBG24]] ; CHECK-NEXT: store i160 [[BUF_PTR_INT]], ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG24]] -; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META15:![0-9]+]], !DIExpression(), [[META25:![0-9]+]]) +; CHECK-NEXT: #dbg_value(i32 0, [[META15:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[META25:![0-9]+]]) +; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[AUX]], [[META15]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[META25]]) ; CHECK-NEXT: [[AUX_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[AUX]] to i160, !dbg [[DBG26:![0-9]+]] ; CHECK-NEXT: [[AUX_PTR_INT:%.*]] = shl nuw i160 [[AUX_PTR_INT_RSRC]], 32, !dbg [[DBG26]] ; CHECK-NEXT: store i160 [[AUX_PTR_INT]], ptr addrspace(5) [[AUX_PTR_VAR]], align 32, !dbg [[DBG26]] @@ -24,10 +26,12 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace ; CHECK-NEXT: [[TMP2:%.*]] = trunc i160 [[TMP1]] to i128, !dbg [[DBG27]] ; CHECK-NEXT: [[BUF_PTR_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8), !dbg [[DBG27]] ; CHECK-NEXT: [[BUF_PTR_2_PTR_OFF:%.*]] = trunc i160 [[BUF_PTR_2]] to i32, !dbg [[DBG27]] -; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META16:![0-9]+]], !DIExpression(), [[DBG27]]) +; CHECK-NEXT: #dbg_value(i32 [[BUF_PTR_2_PTR_OFF]], [[META16:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG27]]) +; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]], [[META16]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG27]]) ; CHECK-NEXT: [[BUF_PTR_3_IDX:%.*]] = mul i32 [[IDX]], 4, !dbg [[DBG28:![0-9]+]] ; CHECK-NEXT: [[BUF_PTR_3:%.*]] = add i32 [[BUF_PTR_2_PTR_OFF]], [[BUF_PTR_3_IDX]], !dbg [[DBG28]] -; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META17:![0-9]+]], !DIExpression(), [[DBG28]]) +; CHECK-NEXT: #dbg_value(i32 [[BUF_PTR_3]], [[META17:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG28]]) +; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]], 
[[META17]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG28]]) ; CHECK-NEXT: [[BUF_PTR_3_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]] to i160, !dbg [[DBG29:![0-9]+]] ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i160 [[BUF_PTR_3_INT_RSRC]], 32, !dbg [[DBG29]] ; CHECK-NEXT: [[BUF_PTR_3_INT_OFF:%.*]] = zext i32 [[BUF_PTR_3]] to i160, !dbg [[DBG29]] @@ -38,7 +42,8 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace ; CHECK-NEXT: [[TMP5:%.*]] = trunc i160 [[TMP4]] to i128, !dbg [[DBG30]] ; CHECK-NEXT: [[BUF_PTR_4_PTR_RSRC:%.*]] = inttoptr i128 [[TMP5]] to ptr addrspace(8), !dbg [[DBG30]] ; CHECK-NEXT: [[BUF_PTR_4_PTR_OFF:%.*]] = trunc i160 [[BUF_PTR_4]] to i32, !dbg [[DBG30]] -; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META18:![0-9]+]], !DIExpression(), [[DBG30]]) +; CHECK-NEXT: #dbg_value(i32 [[BUF_PTR_4_PTR_OFF]], [[META18:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG30]]) +; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF_PTR_4_PTR_RSRC]], [[META18]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG30]]) ; CHECK-NEXT: [[RET:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF_PTR_4_PTR_RSRC]], i32 [[BUF_PTR_4_PTR_OFF]], i32 0, i32 0), !dbg [[DBG31:![0-9]+]] ; CHECK-NEXT: #dbg_value(float [[RET]], [[META19:![0-9]+]], !DIExpression(), [[DBG31]]) ; CHECK-NEXT: [[AUX_PTR_2:%.*]] = load i160, ptr addrspace(5) [[AUX_PTR_VAR]], align 32, !dbg [[DBG32:![0-9]+]] @@ -46,7 +51,8 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace ; CHECK-NEXT: [[TMP7:%.*]] = trunc i160 [[TMP6]] to i128, !dbg [[DBG32]] ; CHECK-NEXT: [[AUX_PTR_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP7]] to ptr addrspace(8), !dbg [[DBG32]] ; CHECK-NEXT: [[AUX_PTR_2_PTR_OFF:%.*]] = trunc i160 [[AUX_PTR_2]] to i32, !dbg [[DBG32]] -; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META20:![0-9]+]], !DIExpression(), [[DBG32]]) +; CHECK-NEXT: #dbg_value(i32 [[AUX_PTR_2_PTR_OFF]], [[META20:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG32]]) +; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[AUX_PTR_2_PTR_RSRC]], [[META20]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG32]]) ; CHECK-NEXT: [[BUF_PTR_4_LEGAL:%.*]] = bitcast i160 [[BUF_PTR_4]] to <5 x i32>, !dbg [[DBG33:![0-9]+]] ; CHECK-NEXT: [[BUF_PTR_4_SLICE_0:%.*]] = shufflevector <5 x i32> [[BUF_PTR_4_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG33]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[BUF_PTR_4_SLICE_0]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]] diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir index 409b1d6..ce67a2e 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir @@ -33,7 +33,7 @@ name: asm_write_vgpr_accvgpr_write_read body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1966090 /* regdef:VGPR_32 */, def $vgpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def $vgpr0 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... 
@@ -47,7 +47,7 @@ name: asm_write_vgpr_accvgpr_write_read_partialnop body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1966090 /* regdef:VGPR_32 */, def $vgpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def $vgpr0 S_NOP 0 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... @@ -60,7 +60,7 @@ name: asm_write_vgpr_accvgpr_write_read_otherreg body: | bb.0: liveins: $vgpr0 - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1966090 /* regdef:VGPR_32 */, def $vgpr1 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def $vgpr1 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll index ae35d0d..e6bc733 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll @@ -17,6 +17,7 @@ ; CHECK-NEXT: .debug_mode: 0 ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .forward_progress: true ; CHECK-NEXT: .image_op: false ; CHECK-NEXT: .lds_size: 0x200 ; CHECK-NEXT: .mem_ordered: true diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll index 638dc89..310040d 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll @@ -19,6 +19,7 @@ ; CHECK-NEXT: .debug_mode: 0 ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: true ; CHECK-NEXT: .image_op: false ; CHECK-NEXT: .lds_size: 0x200 diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll index fb6ac2e..c1846c0 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll @@ -59,6 +59,7 @@ ; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .forward_progress: true ; CHECK-NEXT: .image_op: false ; CHECK-NEXT: .lds_size: 0 ; CHECK-NEXT: .mem_ordered: true @@ -113,6 +114,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_gs ; CHECK-NEXT: .entry_point_symbol: gs_shader +; CHECK-NEXT: .forward_progress: true ; CHECK-NEXT: .lds_size: 0x200 ; CHECK-NEXT: .mem_ordered: true ; CHECK-NEXT: .scratch_en: false @@ -124,6 +126,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_hs ; CHECK-NEXT: .entry_point_symbol: hs_shader +; CHECK-NEXT: .forward_progress: true ; CHECK-NEXT: .lds_size: 0x1000 ; CHECK-NEXT: .mem_ordered: true ; CHECK-NEXT: .scratch_en: false @@ -135,6 +138,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_ps ; CHECK-NEXT: .entry_point_symbol: ps_shader +; CHECK-NEXT: .forward_progress: true ; CHECK-NEXT: .lds_size: 0 ; CHECK-NEXT: .mem_ordered: true ; CHECK-NEXT: .scratch_en: false diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll index 15778c8..5c0c366 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll @@ -62,6 +62,7 @@ ; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .image_op: 
false ; CHECK-NEXT: .lds_size: 0 @@ -118,6 +119,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_gs_main ; CHECK-NEXT: .entry_point_symbol: gs_shader +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0x200 ; CHECK-NEXT: .mem_ordered: true @@ -130,6 +132,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_hs_main ; CHECK-NEXT: .entry_point_symbol: hs_shader +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0x1000 ; CHECK-NEXT: .mem_ordered: true @@ -142,6 +145,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_ps_main ; CHECK-NEXT: .entry_point_symbol: ps_shader +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0 ; CHECK-NEXT: .mem_ordered: true diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll index 644722b..830872a 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll @@ -62,6 +62,7 @@ ; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .image_op: false ; CHECK-NEXT: .lds_size: 0 @@ -118,6 +119,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NOT: .entry_point: _amdgpu_gs_main ; CHECK-NEXT: .entry_point_symbol: gs_shader +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0x200 ; CHECK-NEXT: .mem_ordered: true @@ -130,6 +132,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NOT: .entry_point: _amdgpu_hs_main ; CHECK-NEXT: .entry_point_symbol: hs_shader +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0x1000 ; CHECK-NEXT: .mem_ordered: true @@ -142,6 +145,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NOT: .entry_point: _amdgpu_ps_main ; CHECK-NEXT: .entry_point_symbol: ps_shader +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0 ; CHECK-NEXT: .mem_ordered: true diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir index c9d0cf3..fef7332 100644 --- a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir +++ b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir @@ -45,13 +45,13 @@ body: | INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def $agpr0 %14:vgpr_32 = COPY killed $agpr0 - INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 11534346 /* regdef:VReg_512 */, def %7, 10158090 /* regdef:VReg_256 */, def %8, 4784138 /* regdef:VReg_128 */, def %9, 3670026 /* regdef:VReg_96 */, def %10, 3670026 /* regdef:VReg_96 */, def %11 + INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 27262986 /* regdef:VReg_512 */, def %7, 13565962 /* regdef:VReg_256 */, def %8, 6094858 /* regdef:VReg_128 */, def %9, 4784138 /* regdef:VReg_96 */, def %10, 4784138 /* regdef:VReg_96 */, def %11 INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 12 /* clobber */, 
implicit-def dead early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 11534345 /* reguse:VReg_512 */, %7 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10158089 /* reguse:VReg_256 */, %8 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:VReg_128 */, %9 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_96 */, %10 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_96 */, %11 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 27262985 /* reguse:VReg_512 */, %7 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 13565961 /* reguse:VReg_256 */, %8 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6094857 /* reguse:VReg_128 */, %9 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:VReg_96 */, %10 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:VReg_96 */, %11 $agpr1 = COPY %14 INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9 /* reguse */, killed $agpr1 SI_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir b/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir new file mode 100644 index 0000000..93f4891 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir @@ -0,0 +1,448 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=prologepilog -o - %s | FileCheck %s + +--- +name: save_inactive_lanes_non_csr_vgpr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + ; CHECK-LABEL: name: save_inactive_lanes_non_csr_vgpr + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 14, implicit $exec + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0 + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 14, implicit $exec + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + +... 
+--- +name: save_all_lanes_csr_vgpr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + ; CHECK-LABEL: name: save_all_lanes_csr_vgpr + ; CHECK: liveins: $vgpr40 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr40 = V_MOV_B32_e32 14, implicit $exec + ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0 + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0 + renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $vgpr40 = V_MOV_B32_e32 14, implicit $exec + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0 + +... +--- +name: save_csr_sgpr_to_non_csr_vgpr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $sgpr20, $vgpr191 + ; CHECK-LABEL: name: save_csr_sgpr_to_non_csr_vgpr + ; CHECK: liveins: $sgpr20, $vgpr191, $vgpr192 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr192, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: $vgpr192 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr192 + ; CHECK-NEXT: $sgpr20 = S_MOV_B32 14, implicit $exec + ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr192, 0 + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr192 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + $vgpr192 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr192 + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $sgpr20 = S_MOV_B32 14, implicit $exec + $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr192, 0 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + +... 
+--- +name: save_csr_sgpr_to_csr_vgpr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $sgpr20, $vgpr191 + ; CHECK-LABEL: name: save_csr_sgpr_to_csr_vgpr + ; CHECK: liveins: $sgpr20, $vgpr191 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr191, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191 + ; CHECK-NEXT: $sgpr20 = S_MOV_B32 14, implicit $exec + ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0 + ; CHECK-NEXT: $vgpr191 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191 + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $sgpr20 = S_MOV_B32 14, implicit $exec + $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + +... +--- +name: vgpr_and_sgpr_csr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +liveins: + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +frameInfo: + maxAlignment: 4 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + hasSpilledSGPRs: true + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + spillPhysVGPRs: + - '$vgpr191' + wwmReservedRegs: + - '$vgpr191' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr191 + + ; CHECK-LABEL: name: vgpr_and_sgpr_csr + ; CHECK: liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr40, $vgpr49 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr49, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40 + ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 + ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr49 = 
SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191 + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20 + S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40 + $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + +... +--- +name: split_orig_exec +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +liveins: + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +frameInfo: + maxAlignment: 4 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + hasSpilledSGPRs: true + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + spillPhysVGPRs: + - '$vgpr191' + wwmReservedRegs: + - '$vgpr191' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr191 + + ; CHECK-LABEL: name: split_orig_exec + ; CHECK: liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr40, $vgpr49 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr49, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20 + ; CHECK-NEXT: $sgpr3 = COPY $vcc_lo + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40 + ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 + ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr3, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr49 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr3 + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr3 + $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191 + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20 + $sgpr3 = COPY $vcc_lo + S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40 + $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr3 + +... 
+---
+name: vgpr_superregs
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment: 1
+  isCalleeSavedInfoValid: true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  frameOffsetReg: '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+  returnsVoid: false
+  occupancy: 16
+  sgprForEXECCopy: '$sgpr105'
+  isWholeWaveFunction: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vgpr_superregs
+    ; CHECK: liveins: $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr40, $vgpr41, $vgpr42
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5)
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.4, addrspace 5)
+    ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.5, addrspace 5)
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr41, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.6, addrspace 5)
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr42, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.7, addrspace 5)
+    ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 14, implicit $exec
+    ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr40_vgpr41_vgpr42
+    ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.5, addrspace 5)
+    ; CHECK-NEXT: $vgpr41 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.6, addrspace 5)
+    ; CHECK-NEXT: $vgpr42 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.7, addrspace 5)
+    ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc
+    ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+    ; CHECK-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+    ; CHECK-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
+    ; CHECK-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5)
+    ; CHECK-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.4, addrspace 5)
+    ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0
+    ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+    renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 14, implicit $exec
+    S_NOP 0, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr40_vgpr41_vgpr42
+    SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+
+...
+---
+name: dont_restore_used_vgprs
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+liveins:
+  - { reg: '$vgpr0' }
+  - { reg: '$vgpr20' }
+  - { reg: '$vgpr40' }
+frameInfo:
+  maxAlignment: 1
+  isCalleeSavedInfoValid: true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  frameOffsetReg: '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+  returnsVoid: false
+  occupancy: 16
+  sgprForEXECCopy: '$sgpr105'
+  isWholeWaveFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr20, $vgpr40
+
+    ; CHECK-LABEL: name: dont_restore_used_vgprs
+    ; CHECK: liveins: $vgpr0, $vgpr20, $vgpr40
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; CHECK-NEXT: S_NOP 0, implicit $vgpr0, implicit $vgpr20, implicit $vgpr40
+    ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0
+    ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+    renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+    S_NOP 0, implicit $vgpr0, implicit $vgpr20, implicit $vgpr40
+    SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+
+...
+---
+name: multiple_blocks
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+liveins:
+  - { reg: '$vgpr0' }
+  - { reg: '$vgpr1' }
+frameInfo:
+  maxAlignment: 1
+  isCalleeSavedInfoValid: true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  frameOffsetReg: '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+  returnsVoid: false
+  occupancy: 16
+  sgprForEXECCopy: '$sgpr105'
+  isWholeWaveFunction: true
+body: |
+  ; CHECK-LABEL: name: multiple_blocks
+  ; CHECK: bb.0:
+  ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+  ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+  ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+  ; CHECK-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
+  ; CHECK-NEXT: V_CMPX_EQ_U32_nosdst_e64 $vgpr0, $vgpr1, implicit-def $exec, implicit $exec
+  ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: S_BRANCH %bb.1
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT: successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: renamable $vgpr1 = V_ADD_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT: liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, killed renamable $sgpr1, implicit-def $scc
+  ; CHECK-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr0, $vcc_lo, implicit $exec
+  ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc
+  ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+  ; CHECK-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+  ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo
+  ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo, implicit $vgpr0
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $vgpr0, $vgpr1
+
+    renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+    $sgpr1 = S_MOV_B32 $exec_lo
+    V_CMPX_EQ_U32_nosdst_e64 $vgpr0, $vgpr1, implicit-def $exec, implicit $exec
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1
+
+    renamable $vgpr1 = V_ADD_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec
+
+  bb.2:
+    liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1
+
+    $exec_lo = S_OR_B32 $exec_lo, killed renamable $sgpr1, implicit-def $scc
+    renamable $vgpr0 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr0, $vcc_lo, implicit $exec
+    SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo, implicit $vgpr0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
new file mode 100644
index 0000000..53d0292
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -0,0 +1,2414 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL64 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL64 %s
+
+; Make sure the i1 %active is passed through EXEC.
+; The EXEC mask should be set to -1 for the duration of the function
+; and restored to its original value in the epilogue.
+; We will also need to restore the inactive lanes for any allocated VGPRs.
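+;
+; As a rough, hand-written sketch (not autogenerated output), the wave32
+; prologue/epilogue pattern the checks below verify looks like this; the
+; SGPR holding the original EXEC value (vcc_lo in several tests) can vary:
+;
+;   s_xor_saveexec_b32 vcc_lo, -1   ; save EXEC and switch to the inactive lanes
+;   scratch_store_b32 ...           ; spill the inactive lanes of the used VGPRs
+;   s_mov_b32 exec_lo, -1           ; run the body with all lanes enabled
+;   ...                             ; function body
+;   s_xor_b32 exec_lo, vcc_lo, -1   ; switch back to the inactive lanes
+;   scratch_load_b32 ...            ; reload the inactive lanes of the used VGPRs
+;   s_mov_b32 exec_lo, vcc_lo       ; restore the original EXEC mask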
+define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: basic_test: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: basic_test: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: basic_test: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: basic_test: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; 
GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; GISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %x = select i1 %active, i32 %a, i32 5 + %y = select i1 %active, i32 %b, i32 3 + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Make sure we don't crash if there's only one use for %active. +define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: single_use_of_active: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc_lo +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: single_use_of_active: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc_lo +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: single_use_of_active: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; 
DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: single_use_of_active: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %y = select i1 %active, i32 %b, i32 17 + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %a, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Make sure we don't crash if %active is not used at all. +define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: unused_active: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: v_mov_b32_e32 v0, 14 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: unused_active: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: v_mov_b32_e32 v0, 14 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: unused_active: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: v_mov_b32_e32 v0, 14 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] 
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: unused_active: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: v_mov_b32_e32 v0, 14 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GISEL64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + ret i32 14 +} + +; For any used VGPRs (including those used for SGPR spills), we need to restore the inactive lanes. +; For CSR VGPRs, we need to restore all lanes. +define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: csr: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 offset:4 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:8 +; DAGISEL-NEXT: scratch_store_b32 off, v49, s32 offset:16 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill +; DAGISEL-NEXT: ;;#ASMSTART +; DAGISEL-NEXT: ; clobber CSR +; DAGISEL-NEXT: ;;#ASMEND +; DAGISEL-NEXT: v_writelane_b32 v2, s20, 0 +; DAGISEL-NEXT: ;;#ASMSTART +; DAGISEL-NEXT: ; clobber non-CSR +; DAGISEL-NEXT: ;;#ASMEND +; DAGISEL-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1 +; DAGISEL-NEXT: v_readlane_b32 s20, v2, 0 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8 +; DAGISEL-NEXT: scratch_load_b32 v49, off, s32 offset:16 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_wait_alu 0xf1ff +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: csr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: scratch_store_b32 off, v2, s32 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 offset:4 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:8 +; GISEL-NEXT: scratch_store_b32 off, v49, s32 offset:16 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill +; GISEL-NEXT: ;;#ASMSTART +; GISEL-NEXT: ; clobber CSR +; GISEL-NEXT: ;;#ASMEND +; GISEL-NEXT: v_writelane_b32 v2, s20, 0 +; GISEL-NEXT: ;;#ASMSTART +; GISEL-NEXT: ; clobber non-CSR +; GISEL-NEXT: ;;#ASMEND +; GISEL-NEXT: scratch_load_b32 
v40, off, s32 offset:12 ; 4-byte Folded Reload +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1 +; GISEL-NEXT: v_readlane_b32 s20, v2, 0 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: scratch_load_b32 v2, off, s32 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8 +; GISEL-NEXT: scratch_load_b32 v49, off, s32 offset:16 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_wait_alu 0xf1ff +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: csr: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 offset:4 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:8 +; DAGISEL64-NEXT: scratch_store_b32 off, v49, s32 offset:16 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill +; DAGISEL64-NEXT: ;;#ASMSTART +; DAGISEL64-NEXT: ; clobber CSR +; DAGISEL64-NEXT: ;;#ASMEND +; DAGISEL64-NEXT: v_writelane_b32 v2, s20, 0 +; DAGISEL64-NEXT: ;;#ASMSTART +; DAGISEL64-NEXT: ; clobber non-CSR +; DAGISEL64-NEXT: ;;#ASMEND +; DAGISEL64-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; DAGISEL64-NEXT: v_readlane_b32 s20, v2, 0 +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:8 +; DAGISEL64-NEXT: scratch_load_b32 v49, off, s32 offset:16 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_wait_alu 0xf1ff +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: csr: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: scratch_store_b32 off, v2, s32 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 offset:4 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:8 +; GISEL64-NEXT: scratch_store_b32 off, v49, s32 offset:16 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill +; GISEL64-NEXT: ;;#ASMSTART +; GISEL64-NEXT: ; clobber CSR +; GISEL64-NEXT: ;;#ASMEND +; GISEL64-NEXT: v_writelane_b32 v2, s20, 0 +; GISEL64-NEXT: ;;#ASMSTART +; GISEL64-NEXT: ; clobber non-CSR +; GISEL64-NEXT: ;;#ASMEND +; GISEL64-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 
+; GISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; GISEL64-NEXT: v_readlane_b32 s20, v2, 0 +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: scratch_load_b32 v2, off, s32 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:8 +; GISEL64-NEXT: scratch_load_b32 v49, off, s32 offset:16 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_wait_alu 0xf1ff +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %x = select i1 %active, i32 %a, i32 5 + %y = select i1 %active, i32 %b, i32 3 + call void asm sideeffect "; clobber CSR", "~{v40},~{s48}"() + call void asm sideeffect "; clobber non-CSR", "~{v49},~{s20}"() + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Save and restore all lanes of v40. +define amdgpu_gfx_whole_wave void @csr_vgpr_only(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: csr_vgpr_only: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; DAGISEL-NEXT: ;;#ASMSTART +; DAGISEL-NEXT: ; clobber CSR VGPR +; DAGISEL-NEXT: ;;#ASMEND +; DAGISEL-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: csr_vgpr_only: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GISEL-NEXT: ;;#ASMSTART +; GISEL-NEXT: ; clobber CSR VGPR +; GISEL-NEXT: ;;#ASMEND +; GISEL-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: csr_vgpr_only: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; DAGISEL64-NEXT: ;;#ASMSTART +; DAGISEL64-NEXT: ; clobber CSR VGPR +; DAGISEL64-NEXT: ;;#ASMEND +; DAGISEL64-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: csr_vgpr_only: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL64-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GISEL64-NEXT: ;;#ASMSTART +; GISEL64-NEXT: ; clobber CSR VGPR +; GISEL64-NEXT: ;;#ASMEND +; 
GISEL64-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + call void asm sideeffect "; clobber CSR VGPR", "~{v40}"() + ret void +} + +define amdgpu_gfx_whole_wave void @sgpr_spill_only(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: sgpr_spill_only: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: v_writelane_b32 v0, s68, 0 +; DAGISEL-NEXT: ;;#ASMSTART +; DAGISEL-NEXT: ; clobber CSR SGPR +; DAGISEL-NEXT: ;;#ASMEND +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_readlane_b32 s68, v0, 0 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: sgpr_spill_only: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: v_writelane_b32 v0, s68, 0 +; GISEL-NEXT: ;;#ASMSTART +; GISEL-NEXT: ; clobber CSR SGPR +; GISEL-NEXT: ;;#ASMEND +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_readlane_b32 s68, v0, 0 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: sgpr_spill_only: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: v_writelane_b32 v0, s68, 0 +; DAGISEL64-NEXT: ;;#ASMSTART +; DAGISEL64-NEXT: ; clobber CSR SGPR +; DAGISEL64-NEXT: ;;#ASMEND +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_readlane_b32 s68, v0, 0 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: sgpr_spill_only: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: v_writelane_b32 v0, s68, 0 +; GISEL64-NEXT: ;;#ASMSTART +; GISEL64-NEXT: ; clobber CSR SGPR +; GISEL64-NEXT: 
;;#ASMEND +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL64-NEXT: v_readlane_b32 s68, v0, 0 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GISEL64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + call void asm sideeffect "; clobber CSR SGPR", "~{s68}"() + ret void +} + +define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: multiple_blocks: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; DAGISEL-NEXT: s_mov_b32 s1, exec_lo +; DAGISEL-NEXT: v_cmpx_eq_u32_e64 v0, v1 +; DAGISEL-NEXT: ; %bb.1: ; %if.then +; DAGISEL-NEXT: v_add_nc_u32_e32 v1, v0, v1 +; DAGISEL-NEXT: ; %bb.2: ; %if.end +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: multiple_blocks: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s1, exec_lo +; GISEL-NEXT: v_cmpx_eq_u32_e64 v0, v1 +; GISEL-NEXT: ; %bb.1: ; %if.then +; GISEL-NEXT: v_add_nc_u32_e32 v1, v0, v1 +; GISEL-NEXT: ; %bb.2: ; %if.end +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: multiple_blocks: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; DAGISEL64-NEXT: s_mov_b64 s[2:3], exec +; DAGISEL64-NEXT: v_cmpx_eq_u32_e64 v0, v1 +; DAGISEL64-NEXT: ; %bb.1: ; %if.then +; DAGISEL64-NEXT: v_add_nc_u32_e32 v1, 
v0, v1 +; DAGISEL64-NEXT: ; %bb.2: ; %if.end +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_or_b64 exec, exec, s[2:3] +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: multiple_blocks: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL64-NEXT: s_mov_b64 s[2:3], exec +; GISEL64-NEXT: v_cmpx_eq_u32_e64 v0, v1 +; GISEL64-NEXT: ; %bb.1: ; %if.then +; GISEL64-NEXT: v_add_nc_u32_e32 v1, v0, v1 +; GISEL64-NEXT: ; %bb.2: ; %if.end +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_or_b64 exec, exec, s[2:3] +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %c = icmp eq i32 %a, %b + br i1 %c, label %if.then, label %if.end + +if.then: ; preds = %0 + %d = add i32 %a, %b + br label %if.end + +if.end: + %f = phi i32 [ %d, %if.then ], [ %b, %0 ] + %e = select i1 %active, i32 %a, i32 %f + ret i32 %e +} + +define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { +; DAGISEL-LABEL: ret_64: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; DAGISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_dual_cndmask_b32 v1, 0, v1 :: v_dual_cndmask_b32 v0, 5, v0 +; DAGISEL-NEXT: v_dual_cndmask_b32 v2, 3, v2 :: v_dual_cndmask_b32 v3, 0, v3 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; DAGISEL-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; DAGISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: ret_64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: 
s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 0, v1 +; GISEL-NEXT: v_dual_cndmask_b32 v2, 3, v2 :: v_dual_cndmask_b32 v3, 0, v3 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: ret_64: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; DAGISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v2, 3, v2, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; DAGISEL64-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL64-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; DAGISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: ret_64: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; GISEL64-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GISEL64-NEXT: v_cndmask_b32_e32 v2, 3, v2, vcc +; GISEL64-NEXT: 
v_cndmask_b32_e32 v3, 0, v3, vcc +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL64-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %x = select i1 %active, i64 %a, i64 5 + %y = select i1 %active, i64 %b, i64 3 + %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false) + ret i64 %ret +} + +define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i32> inreg %v4i32, float inreg %float, ptr addrspace(5) inreg %ptr, ptr addrspace(5) inreg %ptr2) { +; DAGISEL-LABEL: inreg_args: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; DAGISEL-NEXT: s_clause 0x5 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; DAGISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s9 +; DAGISEL-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; DAGISEL-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 +; DAGISEL-NEXT: scratch_store_b32 off, v4, s10 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_store_b128 off, v[0:3], s11 +; DAGISEL-NEXT: scratch_store_b32 off, v5, s11 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; DAGISEL-NEXT: s_clause 0x5 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; DAGISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: inreg_args: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 s34, -1 +; GISEL-NEXT: s_clause 0x5 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_mov_b32 s0, s5 +; GISEL-NEXT: s_mov_b32 s1, s6 +; GISEL-NEXT: s_mov_b32 s2, s7 +; GISEL-NEXT: s_mov_b32 s3, s8 +; GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_dual_mov_b32 v0, 
s0 :: v_dual_mov_b32 v3, s3 +; GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v5, s9 +; GISEL-NEXT: scratch_store_b32 off, v4, s10 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_store_b128 off, v[0:3], s11 +; GISEL-NEXT: scratch_store_b32 off, v5, s11 +; GISEL-NEXT: s_xor_b32 exec_lo, s34, -1 +; GISEL-NEXT: s_clause 0x5 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; GISEL-NEXT: s_mov_b32 exec_lo, s34 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: inreg_args: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: s_clause 0x5 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; DAGISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; DAGISEL64-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; DAGISEL64-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: v_mov_b32_e32 v4, s4 +; DAGISEL64-NEXT: v_mov_b32_e32 v0, s5 +; DAGISEL64-NEXT: v_mov_b32_e32 v1, s6 +; DAGISEL64-NEXT: v_mov_b32_e32 v2, s7 +; DAGISEL64-NEXT: v_mov_b32_e32 v3, s8 +; DAGISEL64-NEXT: v_mov_b32_e32 v5, s9 +; DAGISEL64-NEXT: scratch_store_b32 off, v4, s10 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_store_b128 off, v[0:3], s11 +; DAGISEL64-NEXT: scratch_store_b32 off, v5, s11 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; DAGISEL64-NEXT: s_clause 0x5 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; DAGISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; DAGISEL64-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; DAGISEL64-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: inreg_args: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 s[34:35], -1 +; GISEL64-NEXT: s_clause 0x5 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GISEL64-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; GISEL64-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: s_mov_b32 s0, s5 +; GISEL64-NEXT: s_mov_b32 s1, s6 +; GISEL64-NEXT: s_mov_b32 s2, s7 +; GISEL64-NEXT: s_mov_b32 s3, s8 +; GISEL64-NEXT: v_mov_b32_e32 v4, s4 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_mov_b32_e32 v0, s0 +; GISEL64-NEXT: v_mov_b32_e32 v1, s1 +; GISEL64-NEXT: v_mov_b32_e32 v2, s2 +; GISEL64-NEXT: 
v_mov_b32_e32 v3, s3 +; GISEL64-NEXT: v_mov_b32_e32 v5, s9 +; GISEL64-NEXT: scratch_store_b32 off, v4, s10 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_store_b128 off, v[0:3], s11 +; GISEL64-NEXT: scratch_store_b32 off, v5, s11 +; GISEL64-NEXT: s_xor_b64 exec, s[34:35], -1 +; GISEL64-NEXT: s_clause 0x5 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GISEL64-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; GISEL64-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; GISEL64-NEXT: s_mov_b64 exec, s[34:35] +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + store i32 %i32, ptr addrspace(5) %ptr + store <4 x i32> %v4i32, ptr addrspace(5) %ptr2 + store float %float, ptr addrspace(5) %ptr2 + ret void +} + +declare amdgpu_gfx <2 x half> @gfx_callee(<2 x half> %x, <2 x half> %y) + +define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 x half> %x, <2 x half> %y) { +; DAGISEL-LABEL: call_gfx_from_whole_wave: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_mov_b32 s0, s33 +; DAGISEL-NEXT: s_mov_b32 s33, s32 +; DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8 +; DAGISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12 +; DAGISEL-NEXT: scratch_store_b32 off, v3, s33 offset:16 +; DAGISEL-NEXT: scratch_store_b32 off, v4, s33 offset:20 +; DAGISEL-NEXT: scratch_store_b32 off, v5, s33 offset:24 +; DAGISEL-NEXT: scratch_store_b32 off, v6, s33 offset:28 +; DAGISEL-NEXT: scratch_store_b32 off, v7, s33 offset:32 +; DAGISEL-NEXT: scratch_store_b32 off, v8, s33 offset:36 +; DAGISEL-NEXT: scratch_store_b32 off, v9, s33 offset:40 +; DAGISEL-NEXT: scratch_store_b32 off, v10, s33 offset:44 +; DAGISEL-NEXT: scratch_store_b32 off, v11, s33 offset:48 +; DAGISEL-NEXT: scratch_store_b32 off, v12, s33 offset:52 +; DAGISEL-NEXT: scratch_store_b32 off, v13, s33 offset:56 +; DAGISEL-NEXT: scratch_store_b32 off, v14, s33 offset:60 +; DAGISEL-NEXT: scratch_store_b32 off, v15, s33 offset:64 +; DAGISEL-NEXT: scratch_store_b32 off, v16, s33 offset:68 +; DAGISEL-NEXT: scratch_store_b32 off, v17, s33 offset:72 +; DAGISEL-NEXT: scratch_store_b32 off, v18, s33 offset:76 +; DAGISEL-NEXT: scratch_store_b32 off, v19, s33 offset:80 +; DAGISEL-NEXT: scratch_store_b32 off, v20, s33 offset:84 +; DAGISEL-NEXT: scratch_store_b32 off, v21, s33 offset:88 +; DAGISEL-NEXT: scratch_store_b32 off, v22, s33 offset:92 +; DAGISEL-NEXT: scratch_store_b32 off, v23, s33 offset:96 +; DAGISEL-NEXT: scratch_store_b32 off, v24, s33 offset:100 +; DAGISEL-NEXT: scratch_store_b32 off, v25, s33 offset:104 +; DAGISEL-NEXT: scratch_store_b32 off, v26, s33 offset:108 +; DAGISEL-NEXT: scratch_store_b32 off, v27, s33 offset:112 +; DAGISEL-NEXT: scratch_store_b32 off, v28, s33 offset:116 +; DAGISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120 +; DAGISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124 +; DAGISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132 +; DAGISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136 +; DAGISEL-NEXT: 
scratch_store_b32 off, v34, s33 offset:140 +; DAGISEL-NEXT: scratch_store_b32 off, v35, s33 offset:144 +; DAGISEL-NEXT: scratch_store_b32 off, v36, s33 offset:148 +; DAGISEL-NEXT: scratch_store_b32 off, v37, s33 offset:152 +; DAGISEL-NEXT: scratch_store_b32 off, v38, s33 offset:156 +; DAGISEL-NEXT: scratch_store_b32 off, v39, s33 offset:160 +; DAGISEL-NEXT: scratch_store_b32 off, v48, s33 offset:164 +; DAGISEL-NEXT: scratch_store_b32 off, v49, s33 offset:168 +; DAGISEL-NEXT: scratch_store_b32 off, v50, s33 offset:172 +; DAGISEL-NEXT: scratch_store_b32 off, v51, s33 offset:176 +; DAGISEL-NEXT: scratch_store_b32 off, v52, s33 offset:180 +; DAGISEL-NEXT: scratch_store_b32 off, v53, s33 offset:184 +; DAGISEL-NEXT: scratch_store_b32 off, v54, s33 offset:188 +; DAGISEL-NEXT: scratch_store_b32 off, v55, s33 offset:192 +; DAGISEL-NEXT: scratch_store_b32 off, v64, s33 offset:196 +; DAGISEL-NEXT: scratch_store_b32 off, v65, s33 offset:200 +; DAGISEL-NEXT: scratch_store_b32 off, v66, s33 offset:204 +; DAGISEL-NEXT: scratch_store_b32 off, v67, s33 offset:208 +; DAGISEL-NEXT: scratch_store_b32 off, v68, s33 offset:212 +; DAGISEL-NEXT: scratch_store_b32 off, v69, s33 offset:216 +; DAGISEL-NEXT: scratch_store_b32 off, v70, s33 offset:220 +; DAGISEL-NEXT: scratch_store_b32 off, v71, s33 offset:224 +; DAGISEL-NEXT: scratch_store_b32 off, v80, s33 offset:228 +; DAGISEL-NEXT: scratch_store_b32 off, v81, s33 offset:232 +; DAGISEL-NEXT: scratch_store_b32 off, v82, s33 offset:236 +; DAGISEL-NEXT: scratch_store_b32 off, v83, s33 offset:240 +; DAGISEL-NEXT: scratch_store_b32 off, v84, s33 offset:244 +; DAGISEL-NEXT: scratch_store_b32 off, v85, s33 offset:248 +; DAGISEL-NEXT: scratch_store_b32 off, v86, s33 offset:252 +; DAGISEL-NEXT: scratch_store_b32 off, v87, s33 offset:256 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v96, s33 offset:260 +; DAGISEL-NEXT: scratch_store_b32 off, v97, s33 offset:264 +; DAGISEL-NEXT: scratch_store_b32 off, v98, s33 offset:268 +; DAGISEL-NEXT: scratch_store_b32 off, v99, s33 offset:272 +; DAGISEL-NEXT: scratch_store_b32 off, v100, s33 offset:276 +; DAGISEL-NEXT: scratch_store_b32 off, v101, s33 offset:280 +; DAGISEL-NEXT: scratch_store_b32 off, v102, s33 offset:284 +; DAGISEL-NEXT: scratch_store_b32 off, v103, s33 offset:288 +; DAGISEL-NEXT: scratch_store_b32 off, v112, s33 offset:292 +; DAGISEL-NEXT: scratch_store_b32 off, v113, s33 offset:296 +; DAGISEL-NEXT: scratch_store_b32 off, v114, s33 offset:300 +; DAGISEL-NEXT: scratch_store_b32 off, v115, s33 offset:304 +; DAGISEL-NEXT: scratch_store_b32 off, v116, s33 offset:308 +; DAGISEL-NEXT: scratch_store_b32 off, v117, s33 offset:312 +; DAGISEL-NEXT: scratch_store_b32 off, v118, s33 offset:316 +; DAGISEL-NEXT: scratch_store_b32 off, v119, s33 offset:320 +; DAGISEL-NEXT: scratch_store_b32 off, v128, s33 offset:324 +; DAGISEL-NEXT: scratch_store_b32 off, v129, s33 offset:328 +; DAGISEL-NEXT: scratch_store_b32 off, v130, s33 offset:332 +; DAGISEL-NEXT: scratch_store_b32 off, v131, s33 offset:336 +; DAGISEL-NEXT: scratch_store_b32 off, v132, s33 offset:340 +; DAGISEL-NEXT: scratch_store_b32 off, v133, s33 offset:344 +; DAGISEL-NEXT: scratch_store_b32 off, v134, s33 offset:348 +; DAGISEL-NEXT: scratch_store_b32 off, v135, s33 offset:352 +; DAGISEL-NEXT: scratch_store_b32 off, v144, s33 offset:356 +; DAGISEL-NEXT: scratch_store_b32 off, v145, s33 offset:360 +; DAGISEL-NEXT: scratch_store_b32 off, v146, s33 offset:364 +; DAGISEL-NEXT: scratch_store_b32 off, v147, s33 offset:368 +; DAGISEL-NEXT: scratch_store_b32 
off, v148, s33 offset:372 +; DAGISEL-NEXT: scratch_store_b32 off, v149, s33 offset:376 +; DAGISEL-NEXT: scratch_store_b32 off, v150, s33 offset:380 +; DAGISEL-NEXT: scratch_store_b32 off, v151, s33 offset:384 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v160, s33 offset:388 +; DAGISEL-NEXT: scratch_store_b32 off, v161, s33 offset:392 +; DAGISEL-NEXT: scratch_store_b32 off, v162, s33 offset:396 +; DAGISEL-NEXT: scratch_store_b32 off, v163, s33 offset:400 +; DAGISEL-NEXT: scratch_store_b32 off, v164, s33 offset:404 +; DAGISEL-NEXT: scratch_store_b32 off, v165, s33 offset:408 +; DAGISEL-NEXT: scratch_store_b32 off, v166, s33 offset:412 +; DAGISEL-NEXT: scratch_store_b32 off, v167, s33 offset:416 +; DAGISEL-NEXT: scratch_store_b32 off, v176, s33 offset:420 +; DAGISEL-NEXT: scratch_store_b32 off, v177, s33 offset:424 +; DAGISEL-NEXT: scratch_store_b32 off, v178, s33 offset:428 +; DAGISEL-NEXT: scratch_store_b32 off, v179, s33 offset:432 +; DAGISEL-NEXT: scratch_store_b32 off, v180, s33 offset:436 +; DAGISEL-NEXT: scratch_store_b32 off, v181, s33 offset:440 +; DAGISEL-NEXT: scratch_store_b32 off, v182, s33 offset:444 +; DAGISEL-NEXT: scratch_store_b32 off, v183, s33 offset:448 +; DAGISEL-NEXT: scratch_store_b32 off, v192, s33 offset:452 +; DAGISEL-NEXT: scratch_store_b32 off, v193, s33 offset:456 +; DAGISEL-NEXT: scratch_store_b32 off, v194, s33 offset:460 +; DAGISEL-NEXT: scratch_store_b32 off, v195, s33 offset:464 +; DAGISEL-NEXT: scratch_store_b32 off, v196, s33 offset:468 +; DAGISEL-NEXT: scratch_store_b32 off, v197, s33 offset:472 +; DAGISEL-NEXT: scratch_store_b32 off, v198, s33 offset:476 +; DAGISEL-NEXT: scratch_store_b32 off, v199, s33 offset:480 +; DAGISEL-NEXT: scratch_store_b32 off, v208, s33 offset:484 +; DAGISEL-NEXT: scratch_store_b32 off, v209, s33 offset:488 +; DAGISEL-NEXT: scratch_store_b32 off, v210, s33 offset:492 +; DAGISEL-NEXT: scratch_store_b32 off, v211, s33 offset:496 +; DAGISEL-NEXT: scratch_store_b32 off, v212, s33 offset:500 +; DAGISEL-NEXT: scratch_store_b32 off, v213, s33 offset:504 +; DAGISEL-NEXT: scratch_store_b32 off, v214, s33 offset:508 +; DAGISEL-NEXT: scratch_store_b32 off, v215, s33 offset:512 +; DAGISEL-NEXT: s_clause 0xf +; DAGISEL-NEXT: scratch_store_b32 off, v224, s33 offset:516 +; DAGISEL-NEXT: scratch_store_b32 off, v225, s33 offset:520 +; DAGISEL-NEXT: scratch_store_b32 off, v226, s33 offset:524 +; DAGISEL-NEXT: scratch_store_b32 off, v227, s33 offset:528 +; DAGISEL-NEXT: scratch_store_b32 off, v228, s33 offset:532 +; DAGISEL-NEXT: scratch_store_b32 off, v229, s33 offset:536 +; DAGISEL-NEXT: scratch_store_b32 off, v230, s33 offset:540 +; DAGISEL-NEXT: scratch_store_b32 off, v231, s33 offset:544 +; DAGISEL-NEXT: scratch_store_b32 off, v240, s33 offset:548 +; DAGISEL-NEXT: scratch_store_b32 off, v241, s33 offset:552 +; DAGISEL-NEXT: scratch_store_b32 off, v242, s33 offset:556 +; DAGISEL-NEXT: scratch_store_b32 off, v243, s33 offset:560 +; DAGISEL-NEXT: scratch_store_b32 off, v244, s33 offset:564 +; DAGISEL-NEXT: scratch_store_b32 off, v245, s33 offset:568 +; DAGISEL-NEXT: scratch_store_b32 off, v246, s33 offset:572 +; DAGISEL-NEXT: scratch_store_b32 off, v247, s33 offset:576 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_writelane_b32 v40, s0, 3 +; DAGISEL-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL-NEXT: v_swap_b32 v0, v1 +; DAGISEL-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi +; DAGISEL-NEXT: v_writelane_b32 v40, s4, 0 
+; DAGISEL-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo
+; DAGISEL-NEXT: s_addk_co_i32 s32, 0x250
+; DAGISEL-NEXT: v_writelane_b32 v40, s30, 1
+; DAGISEL-NEXT: v_writelane_b32 v40, s31, 2
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_readlane_b32 s31, v40, 2
+; DAGISEL-NEXT: v_readlane_b32 s30, v40, 1
+; DAGISEL-NEXT: v_readlane_b32 s4, v40, 0
+; DAGISEL-NEXT: v_readlane_b32 s0, v40, 3
+; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_mov_b32 s32, s33
+; DAGISEL-NEXT: s_xor_b32 exec_lo, s4, -1
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; DAGISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; DAGISEL-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; DAGISEL-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; DAGISEL-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; DAGISEL-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; DAGISEL-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; DAGISEL-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; DAGISEL-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; DAGISEL-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; DAGISEL-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; DAGISEL-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; DAGISEL-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; DAGISEL-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; DAGISEL-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; DAGISEL-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; DAGISEL-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; DAGISEL-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; DAGISEL-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; DAGISEL-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; DAGISEL-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; DAGISEL-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; DAGISEL-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; DAGISEL-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; DAGISEL-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; DAGISEL-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; DAGISEL-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; DAGISEL-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; DAGISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; DAGISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; DAGISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; DAGISEL-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; DAGISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; DAGISEL-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; DAGISEL-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; DAGISEL-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; DAGISEL-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; DAGISEL-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; DAGISEL-NEXT: scratch_load_b32 v48, off, s33 offset:164
+; DAGISEL-NEXT: scratch_load_b32 v49, off, s33 offset:168
+; DAGISEL-NEXT: scratch_load_b32 v50, off, s33 offset:172
+; DAGISEL-NEXT: scratch_load_b32 v51, off, s33 offset:176
+; DAGISEL-NEXT: scratch_load_b32 v52, off, s33 offset:180
+; DAGISEL-NEXT: scratch_load_b32 v53, off, s33 offset:184
+; DAGISEL-NEXT: scratch_load_b32 v54, off, s33 offset:188
+; DAGISEL-NEXT: scratch_load_b32 v55, off, s33 offset:192
+; DAGISEL-NEXT: scratch_load_b32 v64, off, s33 offset:196
+; DAGISEL-NEXT: scratch_load_b32 v65, off, s33 offset:200
+; DAGISEL-NEXT: scratch_load_b32 v66, off, s33 offset:204
+; DAGISEL-NEXT: scratch_load_b32 v67, off, s33 offset:208
+; DAGISEL-NEXT: scratch_load_b32 v68, off, s33 offset:212
+; DAGISEL-NEXT: scratch_load_b32 v69, off, s33 offset:216
+; DAGISEL-NEXT: scratch_load_b32 v70, off, s33 offset:220
+; DAGISEL-NEXT: scratch_load_b32 v71, off, s33 offset:224
+; DAGISEL-NEXT: scratch_load_b32 v80, off, s33 offset:228
+; DAGISEL-NEXT: scratch_load_b32 v81, off, s33 offset:232
+; DAGISEL-NEXT: scratch_load_b32 v82, off, s33 offset:236
+; DAGISEL-NEXT: scratch_load_b32 v83, off, s33 offset:240
+; DAGISEL-NEXT: scratch_load_b32 v84, off, s33 offset:244
+; DAGISEL-NEXT: scratch_load_b32 v85, off, s33 offset:248
+; DAGISEL-NEXT: scratch_load_b32 v86, off, s33 offset:252
+; DAGISEL-NEXT: scratch_load_b32 v87, off, s33 offset:256
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v96, off, s33 offset:260
+; DAGISEL-NEXT: scratch_load_b32 v97, off, s33 offset:264
+; DAGISEL-NEXT: scratch_load_b32 v98, off, s33 offset:268
+; DAGISEL-NEXT: scratch_load_b32 v99, off, s33 offset:272
+; DAGISEL-NEXT: scratch_load_b32 v100, off, s33 offset:276
+; DAGISEL-NEXT: scratch_load_b32 v101, off, s33 offset:280
+; DAGISEL-NEXT: scratch_load_b32 v102, off, s33 offset:284
+; DAGISEL-NEXT: scratch_load_b32 v103, off, s33 offset:288
+; DAGISEL-NEXT: scratch_load_b32 v112, off, s33 offset:292
+; DAGISEL-NEXT: scratch_load_b32 v113, off, s33 offset:296
+; DAGISEL-NEXT: scratch_load_b32 v114, off, s33 offset:300
+; DAGISEL-NEXT: scratch_load_b32 v115, off, s33 offset:304
+; DAGISEL-NEXT: scratch_load_b32 v116, off, s33 offset:308
+; DAGISEL-NEXT: scratch_load_b32 v117, off, s33 offset:312
+; DAGISEL-NEXT: scratch_load_b32 v118, off, s33 offset:316
+; DAGISEL-NEXT: scratch_load_b32 v119, off, s33 offset:320
+; DAGISEL-NEXT: scratch_load_b32 v128, off, s33 offset:324
+; DAGISEL-NEXT: scratch_load_b32 v129, off, s33 offset:328
+; DAGISEL-NEXT: scratch_load_b32 v130, off, s33 offset:332
+; DAGISEL-NEXT: scratch_load_b32 v131, off, s33 offset:336
+; DAGISEL-NEXT: scratch_load_b32 v132, off, s33 offset:340
+; DAGISEL-NEXT: scratch_load_b32 v133, off, s33 offset:344
+; DAGISEL-NEXT: scratch_load_b32 v134, off, s33 offset:348
+; DAGISEL-NEXT: scratch_load_b32 v135, off, s33 offset:352
+; DAGISEL-NEXT: scratch_load_b32 v144, off, s33 offset:356
+; DAGISEL-NEXT: scratch_load_b32 v145, off, s33 offset:360
+; DAGISEL-NEXT: scratch_load_b32 v146, off, s33 offset:364
+; DAGISEL-NEXT: scratch_load_b32 v147, off, s33 offset:368
+; DAGISEL-NEXT: scratch_load_b32 v148, off, s33 offset:372
+; DAGISEL-NEXT: scratch_load_b32 v149, off, s33 offset:376
+; DAGISEL-NEXT: scratch_load_b32 v150, off, s33 offset:380
+; DAGISEL-NEXT: scratch_load_b32 v151, off, s33 offset:384
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v160, off, s33 offset:388
+; DAGISEL-NEXT: scratch_load_b32 v161, off, s33 offset:392
+; DAGISEL-NEXT: scratch_load_b32 v162, off, s33 offset:396
+; DAGISEL-NEXT: scratch_load_b32 v163, off, s33 offset:400
+; DAGISEL-NEXT: scratch_load_b32 v164, off, s33 offset:404
+; DAGISEL-NEXT: scratch_load_b32 v165, off, s33 offset:408
+; DAGISEL-NEXT: scratch_load_b32 v166, off, s33 offset:412
+; DAGISEL-NEXT: scratch_load_b32 v167, off, s33 offset:416
+; DAGISEL-NEXT: scratch_load_b32 v176, off, s33 offset:420
+; DAGISEL-NEXT: scratch_load_b32 v177, off, s33 offset:424
+; DAGISEL-NEXT: scratch_load_b32 v178, off, s33 offset:428
+; DAGISEL-NEXT: scratch_load_b32 v179, off, s33 offset:432
+; DAGISEL-NEXT: scratch_load_b32 v180, off, s33 offset:436
+; DAGISEL-NEXT: scratch_load_b32 v181, off, s33 offset:440
+; DAGISEL-NEXT: scratch_load_b32 v182, off, s33 offset:444
+; DAGISEL-NEXT: scratch_load_b32 v183, off, s33 offset:448
+; DAGISEL-NEXT: scratch_load_b32 v192, off, s33 offset:452
+; DAGISEL-NEXT: scratch_load_b32 v193, off, s33 offset:456
+; DAGISEL-NEXT: scratch_load_b32 v194, off, s33 offset:460
+; DAGISEL-NEXT: scratch_load_b32 v195, off, s33 offset:464
+; DAGISEL-NEXT: scratch_load_b32 v196, off, s33 offset:468
+; DAGISEL-NEXT: scratch_load_b32 v197, off, s33 offset:472
+; DAGISEL-NEXT: scratch_load_b32 v198, off, s33 offset:476
+; DAGISEL-NEXT: scratch_load_b32 v199, off, s33 offset:480
+; DAGISEL-NEXT: scratch_load_b32 v208, off, s33 offset:484
+; DAGISEL-NEXT: scratch_load_b32 v209, off, s33 offset:488
+; DAGISEL-NEXT: scratch_load_b32 v210, off, s33 offset:492
+; DAGISEL-NEXT: scratch_load_b32 v211, off, s33 offset:496
+; DAGISEL-NEXT: scratch_load_b32 v212, off, s33 offset:500
+; DAGISEL-NEXT: scratch_load_b32 v213, off, s33 offset:504
+; DAGISEL-NEXT: scratch_load_b32 v214, off, s33 offset:508
+; DAGISEL-NEXT: scratch_load_b32 v215, off, s33 offset:512
+; DAGISEL-NEXT: s_clause 0xf
+; DAGISEL-NEXT: scratch_load_b32 v224, off, s33 offset:516
+; DAGISEL-NEXT: scratch_load_b32 v225, off, s33 offset:520
+; DAGISEL-NEXT: scratch_load_b32 v226, off, s33 offset:524
+; DAGISEL-NEXT: scratch_load_b32 v227, off, s33 offset:528
+; DAGISEL-NEXT: scratch_load_b32 v228, off, s33 offset:532
+; DAGISEL-NEXT: scratch_load_b32 v229, off, s33 offset:536
+; DAGISEL-NEXT: scratch_load_b32 v230, off, s33 offset:540
+; DAGISEL-NEXT: scratch_load_b32 v231, off, s33 offset:544
+; DAGISEL-NEXT: scratch_load_b32 v240, off, s33 offset:548
+; DAGISEL-NEXT: scratch_load_b32 v241, off, s33 offset:552
+; DAGISEL-NEXT: scratch_load_b32 v242, off, s33 offset:556
+; DAGISEL-NEXT: scratch_load_b32 v243, off, s33 offset:560
+; DAGISEL-NEXT: scratch_load_b32 v244, off, s33 offset:564
+; DAGISEL-NEXT: scratch_load_b32 v245, off, s33 offset:568
+; DAGISEL-NEXT: scratch_load_b32 v246, off, s33 offset:572
+; DAGISEL-NEXT: scratch_load_b32 v247, off, s33 offset:576
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s4
+; DAGISEL-NEXT: s_mov_b32 s33, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: call_gfx_from_whole_wave:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_mov_b32 s0, s33
+; GISEL-NEXT: s_mov_b32 s33, s32
+; GISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; GISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; GISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; GISEL-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; GISEL-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; GISEL-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; GISEL-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; GISEL-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; GISEL-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; GISEL-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; GISEL-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; GISEL-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; GISEL-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; GISEL-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; GISEL-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; GISEL-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; GISEL-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; GISEL-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; GISEL-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; GISEL-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; GISEL-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; GISEL-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; GISEL-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; GISEL-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; GISEL-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; GISEL-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; GISEL-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; GISEL-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; GISEL-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; GISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; GISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; GISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; GISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; GISEL-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; GISEL-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; GISEL-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; GISEL-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; GISEL-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; GISEL-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; GISEL-NEXT: scratch_store_b32 off, v48, s33 offset:164
+; GISEL-NEXT: scratch_store_b32 off, v49, s33 offset:168
+; GISEL-NEXT: scratch_store_b32 off, v50, s33 offset:172
+; GISEL-NEXT: scratch_store_b32 off, v51, s33 offset:176
+; GISEL-NEXT: scratch_store_b32 off, v52, s33 offset:180
+; GISEL-NEXT: scratch_store_b32 off, v53, s33 offset:184
+; GISEL-NEXT: scratch_store_b32 off, v54, s33 offset:188
+; GISEL-NEXT: scratch_store_b32 off, v55, s33 offset:192
+; GISEL-NEXT: scratch_store_b32 off, v64, s33 offset:196
+; GISEL-NEXT: scratch_store_b32 off, v65, s33 offset:200
+; GISEL-NEXT: scratch_store_b32 off, v66, s33 offset:204
+; GISEL-NEXT: scratch_store_b32 off, v67, s33 offset:208
+; GISEL-NEXT: scratch_store_b32 off, v68, s33 offset:212
+; GISEL-NEXT: scratch_store_b32 off, v69, s33 offset:216
+; GISEL-NEXT: scratch_store_b32 off, v70, s33 offset:220
+; GISEL-NEXT: scratch_store_b32 off, v71, s33 offset:224
+; GISEL-NEXT: scratch_store_b32 off, v80, s33 offset:228
+; GISEL-NEXT: scratch_store_b32 off, v81, s33 offset:232
+; GISEL-NEXT: scratch_store_b32 off, v82, s33 offset:236
+; GISEL-NEXT: scratch_store_b32 off, v83, s33 offset:240
+; GISEL-NEXT: scratch_store_b32 off, v84, s33 offset:244
+; GISEL-NEXT: scratch_store_b32 off, v85, s33 offset:248
+; GISEL-NEXT: scratch_store_b32 off, v86, s33 offset:252
+; GISEL-NEXT: scratch_store_b32 off, v87, s33 offset:256
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v96, s33 offset:260
+; GISEL-NEXT: scratch_store_b32 off, v97, s33 offset:264
+; GISEL-NEXT: scratch_store_b32 off, v98, s33 offset:268
+; GISEL-NEXT: scratch_store_b32 off, v99, s33 offset:272
+; GISEL-NEXT: scratch_store_b32 off, v100, s33 offset:276
+; GISEL-NEXT: scratch_store_b32 off, v101, s33 offset:280
+; GISEL-NEXT: scratch_store_b32 off, v102, s33 offset:284
+; GISEL-NEXT: scratch_store_b32 off, v103, s33 offset:288
+; GISEL-NEXT: scratch_store_b32 off, v112, s33 offset:292
+; GISEL-NEXT: scratch_store_b32 off, v113, s33 offset:296
+; GISEL-NEXT: scratch_store_b32 off, v114, s33 offset:300
+; GISEL-NEXT: scratch_store_b32 off, v115, s33 offset:304
+; GISEL-NEXT: scratch_store_b32 off, v116, s33 offset:308
+; GISEL-NEXT: scratch_store_b32 off, v117, s33 offset:312
+; GISEL-NEXT: scratch_store_b32 off, v118, s33 offset:316
+; GISEL-NEXT: scratch_store_b32 off, v119, s33 offset:320
+; GISEL-NEXT: scratch_store_b32 off, v128, s33 offset:324
+; GISEL-NEXT: scratch_store_b32 off, v129, s33 offset:328
+; GISEL-NEXT: scratch_store_b32 off, v130, s33 offset:332
+; GISEL-NEXT: scratch_store_b32 off, v131, s33 offset:336
+; GISEL-NEXT: scratch_store_b32 off, v132, s33 offset:340
+; GISEL-NEXT: scratch_store_b32 off, v133, s33 offset:344
+; GISEL-NEXT: scratch_store_b32 off, v134, s33 offset:348
+; GISEL-NEXT: scratch_store_b32 off, v135, s33 offset:352
+; GISEL-NEXT: scratch_store_b32 off, v144, s33 offset:356
+; GISEL-NEXT: scratch_store_b32 off, v145, s33 offset:360
+; GISEL-NEXT: scratch_store_b32 off, v146, s33 offset:364
+; GISEL-NEXT: scratch_store_b32 off, v147, s33 offset:368
+; GISEL-NEXT: scratch_store_b32 off, v148, s33 offset:372
+; GISEL-NEXT: scratch_store_b32 off, v149, s33 offset:376
+; GISEL-NEXT: scratch_store_b32 off, v150, s33 offset:380
+; GISEL-NEXT: scratch_store_b32 off, v151, s33 offset:384
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v160, s33 offset:388
+; GISEL-NEXT: scratch_store_b32 off, v161, s33 offset:392
+; GISEL-NEXT: scratch_store_b32 off, v162, s33 offset:396
+; GISEL-NEXT: scratch_store_b32 off, v163, s33 offset:400
+; GISEL-NEXT: scratch_store_b32 off, v164, s33 offset:404
+; GISEL-NEXT: scratch_store_b32 off, v165, s33 offset:408
+; GISEL-NEXT: scratch_store_b32 off, v166, s33 offset:412
+; GISEL-NEXT: scratch_store_b32 off, v167, s33 offset:416
+; GISEL-NEXT: scratch_store_b32 off, v176, s33 offset:420
+; GISEL-NEXT: scratch_store_b32 off, v177, s33 offset:424
+; GISEL-NEXT: scratch_store_b32 off, v178, s33 offset:428
+; GISEL-NEXT: scratch_store_b32 off, v179, s33 offset:432
+; GISEL-NEXT: scratch_store_b32 off, v180, s33 offset:436
+; GISEL-NEXT: scratch_store_b32 off, v181, s33 offset:440
+; GISEL-NEXT: scratch_store_b32 off, v182, s33 offset:444
+; GISEL-NEXT: scratch_store_b32 off, v183, s33 offset:448
+; GISEL-NEXT: scratch_store_b32 off, v192, s33 offset:452
+; GISEL-NEXT: scratch_store_b32 off, v193, s33 offset:456
+; GISEL-NEXT: scratch_store_b32 off, v194, s33 offset:460
+; GISEL-NEXT: scratch_store_b32 off, v195, s33 offset:464
+; GISEL-NEXT: scratch_store_b32 off, v196, s33 offset:468
+; GISEL-NEXT: scratch_store_b32 off, v197, s33 offset:472
+; GISEL-NEXT: scratch_store_b32 off, v198, s33 offset:476
+; GISEL-NEXT: scratch_store_b32 off, v199, s33 offset:480
+; GISEL-NEXT: scratch_store_b32 off, v208, s33 offset:484
+; GISEL-NEXT: scratch_store_b32 off, v209, s33 offset:488
+; GISEL-NEXT: scratch_store_b32 off, v210, s33 offset:492
+; GISEL-NEXT: scratch_store_b32 off, v211, s33 offset:496
+; GISEL-NEXT: scratch_store_b32 off, v212, s33 offset:500
+; GISEL-NEXT: scratch_store_b32 off, v213, s33 offset:504
+; GISEL-NEXT: scratch_store_b32 off, v214, s33 offset:508
+; GISEL-NEXT: scratch_store_b32 off, v215, s33 offset:512
+; GISEL-NEXT: s_clause 0xf
+; GISEL-NEXT: scratch_store_b32 off, v224, s33 offset:516
+; GISEL-NEXT: scratch_store_b32 off, v225, s33 offset:520
+; GISEL-NEXT: scratch_store_b32 off, v226, s33 offset:524
+; GISEL-NEXT: scratch_store_b32 off, v227, s33 offset:528
+; GISEL-NEXT: scratch_store_b32 off, v228, s33 offset:532
+; GISEL-NEXT: scratch_store_b32 off, v229, s33 offset:536
+; GISEL-NEXT: scratch_store_b32 off, v230, s33 offset:540
+; GISEL-NEXT: scratch_store_b32 off, v231, s33 offset:544
+; GISEL-NEXT: scratch_store_b32 off, v240, s33 offset:548
+; GISEL-NEXT: scratch_store_b32 off, v241, s33 offset:552
+; GISEL-NEXT: scratch_store_b32 off, v242, s33 offset:556
+; GISEL-NEXT: scratch_store_b32 off, v243, s33 offset:560
+; GISEL-NEXT: scratch_store_b32 off, v244, s33 offset:564
+; GISEL-NEXT: scratch_store_b32 off, v245, s33 offset:568
+; GISEL-NEXT: scratch_store_b32 off, v246, s33 offset:572
+; GISEL-NEXT: scratch_store_b32 off, v247, s33 offset:576
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_writelane_b32 v40, s0, 3
+; GISEL-NEXT: v_mov_b32_e32 v2, v0
+; GISEL-NEXT: v_swap_b32 v0, v1
+; GISEL-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo
+; GISEL-NEXT: v_writelane_b32 v40, s4, 0
+; GISEL-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi
+; GISEL-NEXT: s_addk_co_i32 s32, 0x250
+; GISEL-NEXT: v_writelane_b32 v40, s30, 1
+; GISEL-NEXT: v_writelane_b32 v40, s31, 2
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_readlane_b32 s31, v40, 2
+; GISEL-NEXT: v_readlane_b32 s30, v40, 1
+; GISEL-NEXT: v_readlane_b32 s4, v40, 0
+; GISEL-NEXT: v_readlane_b32 s0, v40, 3
+; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GISEL-NEXT: s_mov_b32 s32, s33
+; GISEL-NEXT: s_xor_b32 exec_lo, s4, -1
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; GISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; GISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; GISEL-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; GISEL-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; GISEL-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; GISEL-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; GISEL-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; GISEL-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; GISEL-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; GISEL-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; GISEL-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; GISEL-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; GISEL-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; GISEL-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; GISEL-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; GISEL-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; GISEL-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; GISEL-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; GISEL-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; GISEL-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; GISEL-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; GISEL-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; GISEL-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; GISEL-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; GISEL-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; GISEL-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; GISEL-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; GISEL-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; GISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; GISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; GISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; GISEL-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; GISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; GISEL-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; GISEL-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; GISEL-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; GISEL-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; GISEL-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; GISEL-NEXT: scratch_load_b32 v48, off, s33 offset:164
+; GISEL-NEXT: scratch_load_b32 v49, off, s33 offset:168
+; GISEL-NEXT: scratch_load_b32 v50, off, s33 offset:172
+; GISEL-NEXT: scratch_load_b32 v51, off, s33 offset:176
+; GISEL-NEXT: scratch_load_b32 v52, off, s33 offset:180
+; GISEL-NEXT: scratch_load_b32 v53, off, s33 offset:184
+; GISEL-NEXT: scratch_load_b32 v54, off, s33 offset:188
+; GISEL-NEXT: scratch_load_b32 v55, off, s33 offset:192
+; GISEL-NEXT: scratch_load_b32 v64, off, s33 offset:196
+; GISEL-NEXT: scratch_load_b32 v65, off, s33 offset:200
+; GISEL-NEXT: scratch_load_b32 v66, off, s33 offset:204
+; GISEL-NEXT: scratch_load_b32 v67, off, s33 offset:208
+; GISEL-NEXT: scratch_load_b32 v68, off, s33 offset:212
+; GISEL-NEXT: scratch_load_b32 v69, off, s33 offset:216
+; GISEL-NEXT: scratch_load_b32 v70, off, s33 offset:220
+; GISEL-NEXT: scratch_load_b32 v71, off, s33 offset:224
+; GISEL-NEXT: scratch_load_b32 v80, off, s33 offset:228
+; GISEL-NEXT: scratch_load_b32 v81, off, s33 offset:232
+; GISEL-NEXT: scratch_load_b32 v82, off, s33 offset:236
+; GISEL-NEXT: scratch_load_b32 v83, off, s33 offset:240
+; GISEL-NEXT: scratch_load_b32 v84, off, s33 offset:244
+; GISEL-NEXT: scratch_load_b32 v85, off, s33 offset:248
+; GISEL-NEXT: scratch_load_b32 v86, off, s33 offset:252
+; GISEL-NEXT: scratch_load_b32 v87, off, s33 offset:256
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v96, off, s33 offset:260
+; GISEL-NEXT: scratch_load_b32 v97, off, s33 offset:264
+; GISEL-NEXT: scratch_load_b32 v98, off, s33 offset:268
+; GISEL-NEXT: scratch_load_b32 v99, off, s33 offset:272
+; GISEL-NEXT: scratch_load_b32 v100, off, s33 offset:276
+; GISEL-NEXT: scratch_load_b32 v101, off, s33 offset:280
+; GISEL-NEXT: scratch_load_b32 v102, off, s33 offset:284
+; GISEL-NEXT: scratch_load_b32 v103, off, s33 offset:288
+; GISEL-NEXT: scratch_load_b32 v112, off, s33 offset:292
+; GISEL-NEXT: scratch_load_b32 v113, off, s33 offset:296
+; GISEL-NEXT: scratch_load_b32 v114, off, s33 offset:300
+; GISEL-NEXT: scratch_load_b32 v115, off, s33 offset:304
+; GISEL-NEXT: scratch_load_b32 v116, off, s33 offset:308
+; GISEL-NEXT: scratch_load_b32 v117, off, s33 offset:312
+; GISEL-NEXT: scratch_load_b32 v118, off, s33 offset:316
+; GISEL-NEXT: scratch_load_b32 v119, off, s33 offset:320
+; GISEL-NEXT: scratch_load_b32 v128, off, s33 offset:324
+; GISEL-NEXT: scratch_load_b32 v129, off, s33 offset:328
+; GISEL-NEXT: scratch_load_b32 v130, off, s33 offset:332
+; GISEL-NEXT: scratch_load_b32 v131, off, s33 offset:336
+; GISEL-NEXT: scratch_load_b32 v132, off, s33 offset:340
+; GISEL-NEXT: scratch_load_b32 v133, off, s33 offset:344
+; GISEL-NEXT: scratch_load_b32 v134, off, s33 offset:348
+; GISEL-NEXT: scratch_load_b32 v135, off, s33 offset:352
+; GISEL-NEXT: scratch_load_b32 v144, off, s33 offset:356
+; GISEL-NEXT: scratch_load_b32 v145, off, s33 offset:360
+; GISEL-NEXT: scratch_load_b32 v146, off, s33 offset:364
+; GISEL-NEXT: scratch_load_b32 v147, off, s33 offset:368
+; GISEL-NEXT: scratch_load_b32 v148, off, s33 offset:372
+; GISEL-NEXT: scratch_load_b32 v149, off, s33 offset:376
+; GISEL-NEXT: scratch_load_b32 v150, off, s33 offset:380
+; GISEL-NEXT: scratch_load_b32 v151, off, s33 offset:384
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v160, off, s33 offset:388
+; GISEL-NEXT: scratch_load_b32 v161, off, s33 offset:392
+; GISEL-NEXT: scratch_load_b32 v162, off, s33 offset:396
+; GISEL-NEXT: scratch_load_b32 v163, off, s33 offset:400
+; GISEL-NEXT: scratch_load_b32 v164, off, s33 offset:404
+; GISEL-NEXT: scratch_load_b32 v165, off, s33 offset:408
+; GISEL-NEXT: scratch_load_b32 v166, off, s33 offset:412
+; GISEL-NEXT: scratch_load_b32 v167, off, s33 offset:416
+; GISEL-NEXT: scratch_load_b32 v176, off, s33 offset:420
+; GISEL-NEXT: scratch_load_b32 v177, off, s33 offset:424
+; GISEL-NEXT: scratch_load_b32 v178, off, s33 offset:428
+; GISEL-NEXT: scratch_load_b32 v179, off, s33 offset:432
+; GISEL-NEXT: scratch_load_b32 v180, off, s33 offset:436
+; GISEL-NEXT: scratch_load_b32 v181, off, s33 offset:440
+; GISEL-NEXT: scratch_load_b32 v182, off, s33 offset:444
+; GISEL-NEXT: scratch_load_b32 v183, off, s33 offset:448
+; GISEL-NEXT: scratch_load_b32 v192, off, s33 offset:452
+; GISEL-NEXT: scratch_load_b32 v193, off, s33 offset:456
+; GISEL-NEXT: scratch_load_b32 v194, off, s33 offset:460
+; GISEL-NEXT: scratch_load_b32 v195, off, s33 offset:464
+; GISEL-NEXT: scratch_load_b32 v196, off, s33 offset:468
+; GISEL-NEXT: scratch_load_b32 v197, off, s33 offset:472
+; GISEL-NEXT: scratch_load_b32 v198, off, s33 offset:476
+; GISEL-NEXT: scratch_load_b32 v199, off, s33 offset:480
+; GISEL-NEXT: scratch_load_b32 v208, off, s33 offset:484
+; GISEL-NEXT: scratch_load_b32 v209, off, s33 offset:488
+; GISEL-NEXT: scratch_load_b32 v210, off, s33 offset:492
+; GISEL-NEXT: scratch_load_b32 v211, off, s33 offset:496
+; GISEL-NEXT: scratch_load_b32 v212, off, s33 offset:500
+; GISEL-NEXT: scratch_load_b32 v213, off, s33 offset:504
+; GISEL-NEXT: scratch_load_b32 v214, off, s33 offset:508
+; GISEL-NEXT: scratch_load_b32 v215, off, s33 offset:512
+; GISEL-NEXT: s_clause 0xf
+; GISEL-NEXT: scratch_load_b32 v224, off, s33 offset:516
+; GISEL-NEXT: scratch_load_b32 v225, off, s33 offset:520
+; GISEL-NEXT: scratch_load_b32 v226, off, s33 offset:524
+; GISEL-NEXT: scratch_load_b32 v227, off, s33 offset:528
+; GISEL-NEXT: scratch_load_b32 v228, off, s33 offset:532
+; GISEL-NEXT: scratch_load_b32 v229, off, s33 offset:536
+; GISEL-NEXT: scratch_load_b32 v230, off, s33 offset:540
+; GISEL-NEXT: scratch_load_b32 v231, off, s33 offset:544
+; GISEL-NEXT: scratch_load_b32 v240, off, s33 offset:548
+; GISEL-NEXT: scratch_load_b32 v241, off, s33 offset:552
+; GISEL-NEXT: scratch_load_b32 v242, off, s33 offset:556
+; GISEL-NEXT: scratch_load_b32 v243, off, s33 offset:560
+; GISEL-NEXT: scratch_load_b32 v244, off, s33 offset:564
+; GISEL-NEXT: scratch_load_b32 v245, off, s33 offset:568
+; GISEL-NEXT: scratch_load_b32 v246, off, s33 offset:572
+; GISEL-NEXT: scratch_load_b32 v247, off, s33 offset:576
+; GISEL-NEXT: s_mov_b32 exec_lo, s4
+; GISEL-NEXT: s_mov_b32 s33, s0
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: call_gfx_from_whole_wave:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_mov_b32 s0, s33
+; DAGISEL64-NEXT: s_mov_b32 s33, s32
+; DAGISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; DAGISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; DAGISEL64-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; DAGISEL64-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; DAGISEL64-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; DAGISEL64-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; DAGISEL64-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; DAGISEL64-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; DAGISEL64-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; DAGISEL64-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; DAGISEL64-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; DAGISEL64-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; DAGISEL64-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; DAGISEL64-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; DAGISEL64-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; DAGISEL64-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; DAGISEL64-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; DAGISEL64-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; DAGISEL64-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; DAGISEL64-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; DAGISEL64-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; DAGISEL64-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; DAGISEL64-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; DAGISEL64-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; DAGISEL64-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; DAGISEL64-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; DAGISEL64-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; DAGISEL64-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; DAGISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; DAGISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; DAGISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; DAGISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; DAGISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; DAGISEL64-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; DAGISEL64-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; DAGISEL64-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; DAGISEL64-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; DAGISEL64-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; DAGISEL64-NEXT: scratch_store_b32 off, v48, s33 offset:164
+; DAGISEL64-NEXT: scratch_store_b32 off, v49, s33 offset:168
+; DAGISEL64-NEXT: scratch_store_b32 off, v50, s33 offset:172
+; DAGISEL64-NEXT: scratch_store_b32 off, v51, s33 offset:176
+; DAGISEL64-NEXT: scratch_store_b32 off, v52, s33 offset:180
+; DAGISEL64-NEXT: scratch_store_b32 off, v53, s33 offset:184
+; DAGISEL64-NEXT: scratch_store_b32 off, v54, s33 offset:188
+; DAGISEL64-NEXT: scratch_store_b32 off, v55, s33 offset:192
+; DAGISEL64-NEXT: scratch_store_b32 off, v64, s33 offset:196
+; DAGISEL64-NEXT: scratch_store_b32 off, v65, s33 offset:200
+; DAGISEL64-NEXT: scratch_store_b32 off, v66, s33 offset:204
+; DAGISEL64-NEXT: scratch_store_b32 off, v67, s33 offset:208
+; DAGISEL64-NEXT: scratch_store_b32 off, v68, s33 offset:212
+; DAGISEL64-NEXT: scratch_store_b32 off, v69, s33 offset:216
+; DAGISEL64-NEXT: scratch_store_b32 off, v70, s33 offset:220
+; DAGISEL64-NEXT: scratch_store_b32 off, v71, s33 offset:224
+; DAGISEL64-NEXT: scratch_store_b32 off, v80, s33 offset:228
+; DAGISEL64-NEXT: scratch_store_b32 off, v81, s33 offset:232
+; DAGISEL64-NEXT: scratch_store_b32 off, v82, s33 offset:236
+; DAGISEL64-NEXT: scratch_store_b32 off, v83, s33 offset:240
+; DAGISEL64-NEXT: scratch_store_b32 off, v84, s33 offset:244
+; DAGISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:248
+; DAGISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:252
+; DAGISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:256
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:260
+; DAGISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:264
+; DAGISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:268
+; DAGISEL64-NEXT: scratch_store_b32 off, v99, s33 offset:272
+; DAGISEL64-NEXT: scratch_store_b32 off, v100, s33 offset:276
+; DAGISEL64-NEXT: scratch_store_b32 off, v101, s33 offset:280
+; DAGISEL64-NEXT: scratch_store_b32 off, v102, s33 offset:284
+; DAGISEL64-NEXT: scratch_store_b32 off, v103, s33 offset:288
+; DAGISEL64-NEXT: scratch_store_b32 off, v112, s33 offset:292
+; DAGISEL64-NEXT: scratch_store_b32 off, v113, s33 offset:296
+; DAGISEL64-NEXT: scratch_store_b32 off, v114, s33 offset:300
+; DAGISEL64-NEXT: scratch_store_b32 off, v115, s33 offset:304
+; DAGISEL64-NEXT: scratch_store_b32 off, v116, s33 offset:308
+; DAGISEL64-NEXT: scratch_store_b32 off, v117, s33 offset:312
+; DAGISEL64-NEXT: scratch_store_b32 off, v118, s33 offset:316
+; DAGISEL64-NEXT: scratch_store_b32 off, v119, s33 offset:320
+; DAGISEL64-NEXT: scratch_store_b32 off, v128, s33 offset:324
+; DAGISEL64-NEXT: scratch_store_b32 off, v129, s33 offset:328
+; DAGISEL64-NEXT: scratch_store_b32 off, v130, s33 offset:332
+; DAGISEL64-NEXT: scratch_store_b32 off, v131, s33 offset:336
+; DAGISEL64-NEXT: scratch_store_b32 off, v132, s33 offset:340
+; DAGISEL64-NEXT: scratch_store_b32 off, v133, s33 offset:344
+; DAGISEL64-NEXT: scratch_store_b32 off, v134, s33 offset:348
+; DAGISEL64-NEXT: scratch_store_b32 off, v135, s33 offset:352
+; DAGISEL64-NEXT: scratch_store_b32 off, v144, s33 offset:356
+; DAGISEL64-NEXT: scratch_store_b32 off, v145, s33 offset:360
+; DAGISEL64-NEXT: scratch_store_b32 off, v146, s33 offset:364
+; DAGISEL64-NEXT: scratch_store_b32 off, v147, s33 offset:368
+; DAGISEL64-NEXT: scratch_store_b32 off, v148, s33 offset:372
+; DAGISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:376
+; DAGISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:380
+; DAGISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:384
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:388
+; DAGISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:392
+; DAGISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:396
+; DAGISEL64-NEXT: scratch_store_b32 off, v163, s33 offset:400
+; DAGISEL64-NEXT: scratch_store_b32 off, v164, s33 offset:404
+; DAGISEL64-NEXT: scratch_store_b32 off, v165, s33 offset:408
+; DAGISEL64-NEXT: scratch_store_b32 off, v166, s33 offset:412
+; DAGISEL64-NEXT: scratch_store_b32 off, v167, s33 offset:416
+; DAGISEL64-NEXT: scratch_store_b32 off, v176, s33 offset:420
+; DAGISEL64-NEXT: scratch_store_b32 off, v177, s33 offset:424
+; DAGISEL64-NEXT: scratch_store_b32 off, v178, s33 offset:428
+; DAGISEL64-NEXT: scratch_store_b32 off, v179, s33 offset:432
+; DAGISEL64-NEXT: scratch_store_b32 off, v180, s33 offset:436
+; DAGISEL64-NEXT: scratch_store_b32 off, v181, s33 offset:440
+; DAGISEL64-NEXT: scratch_store_b32 off, v182, s33 offset:444
+; DAGISEL64-NEXT: scratch_store_b32 off, v183, s33 offset:448
+; DAGISEL64-NEXT: scratch_store_b32 off, v192, s33 offset:452
+; DAGISEL64-NEXT: scratch_store_b32 off, v193, s33 offset:456
+; DAGISEL64-NEXT: scratch_store_b32 off, v194, s33 offset:460
+; DAGISEL64-NEXT: scratch_store_b32 off, v195, s33 offset:464
+; DAGISEL64-NEXT: scratch_store_b32 off, v196, s33 offset:468
+; DAGISEL64-NEXT: scratch_store_b32 off, v197, s33 offset:472
+; DAGISEL64-NEXT: scratch_store_b32 off, v198, s33 offset:476
+; DAGISEL64-NEXT: scratch_store_b32 off, v199, s33 offset:480
+; DAGISEL64-NEXT: scratch_store_b32 off, v208, s33 offset:484
+; DAGISEL64-NEXT: scratch_store_b32 off, v209, s33 offset:488
+; DAGISEL64-NEXT: scratch_store_b32 off, v210, s33 offset:492
+; DAGISEL64-NEXT: scratch_store_b32 off, v211, s33 offset:496
+; DAGISEL64-NEXT: scratch_store_b32 off, v212, s33 offset:500
+; DAGISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:504
+; DAGISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:508
+; DAGISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:512
+; DAGISEL64-NEXT: s_clause 0xf
+; DAGISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:516
+; DAGISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:520
+; DAGISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:524
+; DAGISEL64-NEXT: scratch_store_b32 off, v227, s33 offset:528
+; DAGISEL64-NEXT: scratch_store_b32 off, v228, s33 offset:532
+; DAGISEL64-NEXT: scratch_store_b32 off, v229, s33 offset:536
+; DAGISEL64-NEXT: scratch_store_b32 off, v230, s33 offset:540
+; DAGISEL64-NEXT: scratch_store_b32 off, v231, s33 offset:544
+; DAGISEL64-NEXT: scratch_store_b32 off, v240, s33 offset:548
+; DAGISEL64-NEXT: scratch_store_b32 off, v241, s33 offset:552
+; DAGISEL64-NEXT: scratch_store_b32 off, v242, s33 offset:556
+; DAGISEL64-NEXT: scratch_store_b32 off, v243, s33 offset:560
+; DAGISEL64-NEXT: scratch_store_b32 off, v244, s33 offset:564
+; DAGISEL64-NEXT: scratch_store_b32 off, v245, s33 offset:568
+; DAGISEL64-NEXT: scratch_store_b32 off, v246, s33 offset:572
+; DAGISEL64-NEXT: scratch_store_b32 off, v247, s33 offset:576
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: v_writelane_b32 v40, s0, 4
+; DAGISEL64-NEXT: v_mov_b32_e32 v2, v0
+; DAGISEL64-NEXT: v_swap_b32 v0, v1
+; DAGISEL64-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi
+; DAGISEL64-NEXT: v_writelane_b32 v40, s4, 0
+; DAGISEL64-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo
+; DAGISEL64-NEXT: s_addk_co_i32 s32, 0x250
+; DAGISEL64-NEXT: v_writelane_b32 v40, s5, 1
+; DAGISEL64-NEXT: v_writelane_b32 v40, s30, 2
+; DAGISEL64-NEXT: v_writelane_b32 v40, s31, 3
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT: v_readlane_b32 s31, v40, 3
+; DAGISEL64-NEXT: v_readlane_b32 s30, v40, 2
+; DAGISEL64-NEXT: v_readlane_b32 s5, v40, 1
+; DAGISEL64-NEXT: v_readlane_b32 s4, v40, 0
+; DAGISEL64-NEXT: v_readlane_b32 s0, v40, 4
+; DAGISEL64-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; DAGISEL64-NEXT: s_mov_b32 s32, s33
+; DAGISEL64-NEXT: s_xor_b64 exec, s[4:5], -1
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; DAGISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; DAGISEL64-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; DAGISEL64-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; DAGISEL64-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; DAGISEL64-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; DAGISEL64-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; DAGISEL64-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; DAGISEL64-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; DAGISEL64-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; DAGISEL64-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; DAGISEL64-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; DAGISEL64-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; DAGISEL64-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; DAGISEL64-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; DAGISEL64-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; DAGISEL64-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; DAGISEL64-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; DAGISEL64-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; DAGISEL64-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; DAGISEL64-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; DAGISEL64-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; DAGISEL64-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; DAGISEL64-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; DAGISEL64-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; DAGISEL64-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; DAGISEL64-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; DAGISEL64-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; DAGISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; DAGISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; DAGISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; DAGISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; DAGISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; DAGISEL64-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; DAGISEL64-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; DAGISEL64-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; DAGISEL64-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; DAGISEL64-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; DAGISEL64-NEXT: scratch_load_b32 v48, off, s33 offset:164
+; DAGISEL64-NEXT: scratch_load_b32 v49, off, s33 offset:168
+; DAGISEL64-NEXT: scratch_load_b32 v50, off, s33 offset:172
+; DAGISEL64-NEXT: scratch_load_b32 v51, off, s33 offset:176
+; DAGISEL64-NEXT: scratch_load_b32 v52, off, s33 offset:180
+; DAGISEL64-NEXT: scratch_load_b32 v53, off, s33 offset:184
+; DAGISEL64-NEXT: scratch_load_b32 v54, off, s33 offset:188
+; DAGISEL64-NEXT: scratch_load_b32 v55, off, s33 offset:192
+; DAGISEL64-NEXT: scratch_load_b32 v64, off, s33 offset:196
+; DAGISEL64-NEXT: scratch_load_b32 v65, off, s33 offset:200
+; DAGISEL64-NEXT: scratch_load_b32 v66, off, s33 offset:204
+; DAGISEL64-NEXT: scratch_load_b32 v67, off, s33 offset:208
+; DAGISEL64-NEXT: scratch_load_b32 v68, off, s33 offset:212
+; DAGISEL64-NEXT: scratch_load_b32 v69, off, s33 offset:216
+; DAGISEL64-NEXT: scratch_load_b32 v70, off, s33 offset:220
+; DAGISEL64-NEXT: scratch_load_b32 v71, off, s33 offset:224
+; DAGISEL64-NEXT: scratch_load_b32 v80, off, s33 offset:228
+; DAGISEL64-NEXT: scratch_load_b32 v81, off, s33 offset:232
+; DAGISEL64-NEXT: scratch_load_b32 v82, off, s33 offset:236
+; DAGISEL64-NEXT: scratch_load_b32 v83, off, s33 offset:240
+; DAGISEL64-NEXT: scratch_load_b32 v84, off, s33 offset:244
+; DAGISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:248
+; DAGISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:252
+; DAGISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:256
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:260
+; DAGISEL64-NEXT: scratch_load_b32 v97, off, s33 offset:264
+; DAGISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:268
+; DAGISEL64-NEXT: scratch_load_b32 v99, off, s33 offset:272
+; DAGISEL64-NEXT: scratch_load_b32 v100, off, s33 offset:276
+; DAGISEL64-NEXT: scratch_load_b32 v101, off, s33 offset:280
+; DAGISEL64-NEXT: scratch_load_b32 v102, off, s33 offset:284
+; DAGISEL64-NEXT: scratch_load_b32 v103, off, s33 offset:288
+; DAGISEL64-NEXT: scratch_load_b32 v112, off, s33 offset:292
+; DAGISEL64-NEXT: scratch_load_b32 v113, off, s33 offset:296
+; DAGISEL64-NEXT: scratch_load_b32 v114, off, s33 offset:300
+; DAGISEL64-NEXT: scratch_load_b32 v115, off, s33 offset:304
+; DAGISEL64-NEXT: scratch_load_b32 v116, off, s33 offset:308
+; DAGISEL64-NEXT: scratch_load_b32 v117, off, s33 offset:312
+; DAGISEL64-NEXT: scratch_load_b32 v118, off, s33 offset:316
+; DAGISEL64-NEXT: scratch_load_b32 v119, off, s33 offset:320
+; DAGISEL64-NEXT: scratch_load_b32 v128, off, s33 offset:324
+; DAGISEL64-NEXT: scratch_load_b32 v129, off, s33 offset:328
+; DAGISEL64-NEXT: scratch_load_b32 v130, off, s33 offset:332
+; DAGISEL64-NEXT: scratch_load_b32 v131, off, s33 offset:336
+; DAGISEL64-NEXT: scratch_load_b32 v132, off, s33 offset:340
+; DAGISEL64-NEXT: scratch_load_b32 v133, off, s33 offset:344
+; DAGISEL64-NEXT: scratch_load_b32 v134, off, s33 offset:348
+; DAGISEL64-NEXT: scratch_load_b32 v135, off, s33 offset:352
+; DAGISEL64-NEXT: scratch_load_b32 v144, off, s33 offset:356
+; DAGISEL64-NEXT: scratch_load_b32 v145, off, s33 offset:360
+; DAGISEL64-NEXT: scratch_load_b32 v146, off, s33 offset:364
+; DAGISEL64-NEXT: scratch_load_b32 v147, off, s33 offset:368
+; DAGISEL64-NEXT: scratch_load_b32 v148, off, s33 offset:372
+; DAGISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:376
+; DAGISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:380
+; DAGISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:384
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:388
+; DAGISEL64-NEXT: scratch_load_b32 v161, off, s33 offset:392
+; DAGISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:396
+; DAGISEL64-NEXT: scratch_load_b32 v163, off, s33 offset:400
+; DAGISEL64-NEXT: scratch_load_b32 v164, off, s33 offset:404
+; DAGISEL64-NEXT: scratch_load_b32 v165, off, s33 offset:408
+; DAGISEL64-NEXT: scratch_load_b32 v166, off, s33 offset:412
+; DAGISEL64-NEXT: scratch_load_b32 v167, off, s33 offset:416
+; DAGISEL64-NEXT: scratch_load_b32 v176, off, s33 offset:420
+; DAGISEL64-NEXT: scratch_load_b32 v177, off, s33 offset:424
+; DAGISEL64-NEXT: scratch_load_b32 v178, off, s33 offset:428
+; DAGISEL64-NEXT: scratch_load_b32 v179, off, s33 offset:432
+; DAGISEL64-NEXT: scratch_load_b32 v180, off, s33 offset:436
+; DAGISEL64-NEXT: scratch_load_b32 v181, off, s33 offset:440
+; DAGISEL64-NEXT: scratch_load_b32 v182, off, s33 offset:444
+; DAGISEL64-NEXT: scratch_load_b32 v183, off, s33 offset:448
+; DAGISEL64-NEXT: scratch_load_b32 v192, off, s33 offset:452
+; DAGISEL64-NEXT: scratch_load_b32 v193, off, s33 offset:456
+; DAGISEL64-NEXT: scratch_load_b32 v194, off, s33 offset:460
+; DAGISEL64-NEXT: scratch_load_b32 v195, off, s33 offset:464
+; DAGISEL64-NEXT: scratch_load_b32 v196, off, s33 offset:468
+; DAGISEL64-NEXT: scratch_load_b32 v197, off, s33 offset:472
+; DAGISEL64-NEXT: scratch_load_b32 v198, off, s33 offset:476
+; DAGISEL64-NEXT: scratch_load_b32 v199, off, s33 offset:480
+; DAGISEL64-NEXT: scratch_load_b32 v208, off, s33 offset:484
+; DAGISEL64-NEXT: scratch_load_b32 v209, off, s33 offset:488
+; DAGISEL64-NEXT: scratch_load_b32 v210, off, s33 offset:492
+; DAGISEL64-NEXT: scratch_load_b32 v211, off, s33 offset:496
+; DAGISEL64-NEXT: scratch_load_b32 v212, off, s33 offset:500
+; DAGISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:504
+; DAGISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:508
+; DAGISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:512
+; DAGISEL64-NEXT: s_clause 0xf
+; DAGISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:516
+; DAGISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:520
+; DAGISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:524
+; DAGISEL64-NEXT: scratch_load_b32 v227, off, s33 offset:528
+; DAGISEL64-NEXT: scratch_load_b32 v228, off, s33 offset:532
+; DAGISEL64-NEXT: scratch_load_b32 v229, off, s33 offset:536
+; DAGISEL64-NEXT: scratch_load_b32 v230, off, s33 offset:540
+; DAGISEL64-NEXT: scratch_load_b32 v231, off, s33 offset:544
+; DAGISEL64-NEXT: scratch_load_b32 v240, off, s33 offset:548
+; DAGISEL64-NEXT: scratch_load_b32 v241, off, s33 offset:552
+; DAGISEL64-NEXT: scratch_load_b32 v242, off, s33 offset:556
+; DAGISEL64-NEXT: scratch_load_b32 v243, off, s33 offset:560
+; DAGISEL64-NEXT: scratch_load_b32 v244, off, s33 offset:564
+; DAGISEL64-NEXT: scratch_load_b32 v245, off, s33 offset:568
+; DAGISEL64-NEXT: scratch_load_b32 v246, off, s33 offset:572
+; DAGISEL64-NEXT: scratch_load_b32 v247, off, s33 offset:576
+; DAGISEL64-NEXT: s_mov_b64 exec, s[4:5]
+; DAGISEL64-NEXT: s_mov_b32 s33, s0
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: call_gfx_from_whole_wave:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_mov_b32 s0, s33
+; GISEL64-NEXT: s_mov_b32 s33, s32
+; GISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; GISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; GISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; GISEL64-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; GISEL64-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; GISEL64-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; GISEL64-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; GISEL64-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; GISEL64-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; GISEL64-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; GISEL64-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; GISEL64-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; GISEL64-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; GISEL64-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; GISEL64-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; GISEL64-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; GISEL64-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; GISEL64-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; GISEL64-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; GISEL64-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; GISEL64-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; GISEL64-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; GISEL64-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; GISEL64-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; GISEL64-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; GISEL64-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; GISEL64-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; GISEL64-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; GISEL64-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; GISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; GISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; GISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; GISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; GISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; GISEL64-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; GISEL64-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; GISEL64-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; GISEL64-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; GISEL64-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; GISEL64-NEXT: scratch_store_b32 off, v48, s33 offset:164
+; GISEL64-NEXT: scratch_store_b32 off, v49, s33 offset:168
+; GISEL64-NEXT: scratch_store_b32 off, v50, s33 offset:172
+; GISEL64-NEXT: scratch_store_b32 off, v51, s33 offset:176
+; GISEL64-NEXT: scratch_store_b32 off, v52, s33 offset:180
+; GISEL64-NEXT: scratch_store_b32 off, v53, s33 offset:184
+; GISEL64-NEXT: scratch_store_b32 off, v54, s33 offset:188
+; GISEL64-NEXT: scratch_store_b32 off, v55, s33 offset:192
+; GISEL64-NEXT: scratch_store_b32 off, v64, s33 offset:196
+; GISEL64-NEXT: scratch_store_b32 off, v65, s33 offset:200
+; GISEL64-NEXT: scratch_store_b32 off, v66, s33 offset:204
+; GISEL64-NEXT: scratch_store_b32 off, v67, s33 offset:208
+; GISEL64-NEXT: scratch_store_b32 off, v68, s33 offset:212
+; GISEL64-NEXT: scratch_store_b32 off, v69, s33 offset:216
+; GISEL64-NEXT: scratch_store_b32 off, v70, s33 offset:220
+; GISEL64-NEXT: scratch_store_b32 off, v71, s33 offset:224
+; GISEL64-NEXT: scratch_store_b32 off, v80, s33 offset:228
+; GISEL64-NEXT: scratch_store_b32 off, v81, s33 offset:232
+; GISEL64-NEXT: scratch_store_b32 off, v82, s33 offset:236
+; GISEL64-NEXT: scratch_store_b32 off, v83, s33 offset:240
+; GISEL64-NEXT: scratch_store_b32 off, v84, s33 offset:244
+; GISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:248
+; GISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:252
+; GISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:256
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:260
+; GISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:264
+; GISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:268
+; GISEL64-NEXT: scratch_store_b32 off, v99, s33 offset:272
+; GISEL64-NEXT: scratch_store_b32 off, v100, s33 offset:276
+; GISEL64-NEXT: scratch_store_b32 off, v101, s33 offset:280
+; GISEL64-NEXT: scratch_store_b32 off, v102, s33 offset:284
+; GISEL64-NEXT: scratch_store_b32 off, v103, s33 offset:288
+; GISEL64-NEXT: scratch_store_b32 off, v112, s33 offset:292
+; GISEL64-NEXT: scratch_store_b32 off, v113, s33 offset:296
+; GISEL64-NEXT: scratch_store_b32 off, v114, s33 offset:300
+; GISEL64-NEXT: scratch_store_b32 off, v115, s33 offset:304
+; GISEL64-NEXT: scratch_store_b32 off, v116, s33 offset:308
+; GISEL64-NEXT: scratch_store_b32 off, v117, s33 offset:312
+; GISEL64-NEXT: scratch_store_b32 off, v118, s33 offset:316
+; GISEL64-NEXT: scratch_store_b32 off, v119, s33 offset:320
+; GISEL64-NEXT: scratch_store_b32 off, v128, s33 offset:324
+; GISEL64-NEXT: scratch_store_b32 off, v129, s33 offset:328
+; GISEL64-NEXT: scratch_store_b32 off, v130, s33 offset:332
+; GISEL64-NEXT: scratch_store_b32 off, v131, s33 offset:336
+; GISEL64-NEXT: scratch_store_b32 off, v132, s33 offset:340
+; GISEL64-NEXT: scratch_store_b32 off, v133, s33 offset:344
+; GISEL64-NEXT: scratch_store_b32 off, v134, s33 offset:348
+; GISEL64-NEXT: scratch_store_b32 off, v135, s33 offset:352
+; GISEL64-NEXT: scratch_store_b32 off, v144, s33 offset:356
+; GISEL64-NEXT: scratch_store_b32 off, v145, s33 offset:360
+; GISEL64-NEXT: scratch_store_b32 off, v146, s33 offset:364
+; GISEL64-NEXT: scratch_store_b32 off, v147, s33 offset:368
+; GISEL64-NEXT: scratch_store_b32 off, v148, s33 offset:372
+; GISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:376
+; GISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:380
+; GISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:384
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:388
+; GISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:392
+; GISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:396
+; GISEL64-NEXT: scratch_store_b32 off, v163, s33 offset:400
+; GISEL64-NEXT: scratch_store_b32 off, v164, s33 offset:404
+; GISEL64-NEXT: scratch_store_b32 off, v165, s33 offset:408
+; GISEL64-NEXT: scratch_store_b32 off, v166, s33 offset:412
+; GISEL64-NEXT: scratch_store_b32 off, v167, s33 offset:416
+; GISEL64-NEXT: scratch_store_b32 off, v176, s33 offset:420
+; GISEL64-NEXT: scratch_store_b32 off, v177, s33 offset:424
+; GISEL64-NEXT: scratch_store_b32 off, v178, s33 offset:428
+; GISEL64-NEXT: scratch_store_b32 off, v179, s33 offset:432
+; GISEL64-NEXT: scratch_store_b32 off, v180, s33 offset:436
+; GISEL64-NEXT: scratch_store_b32 off, v181, s33 offset:440
+; GISEL64-NEXT: scratch_store_b32 off, v182, s33 offset:444
+; GISEL64-NEXT: scratch_store_b32 off, v183, s33 offset:448
+; GISEL64-NEXT: scratch_store_b32 off, v192, s33 offset:452
+; GISEL64-NEXT: scratch_store_b32 off, v193, s33 offset:456
+; GISEL64-NEXT: scratch_store_b32 off, v194, s33 offset:460
+; GISEL64-NEXT: scratch_store_b32 off, v195, s33 offset:464
+; GISEL64-NEXT: scratch_store_b32 off, v196, s33 offset:468
+; GISEL64-NEXT: scratch_store_b32 off, v197, s33 offset:472
+; GISEL64-NEXT: scratch_store_b32 off, v198, s33 offset:476
+; GISEL64-NEXT: scratch_store_b32 off, v199, s33 offset:480
+; GISEL64-NEXT: scratch_store_b32 off, v208, s33 offset:484
+; GISEL64-NEXT: scratch_store_b32 off, v209, s33 offset:488
+; GISEL64-NEXT: scratch_store_b32 off, v210, s33 offset:492
+; GISEL64-NEXT: scratch_store_b32 off, v211, s33 offset:496
+; GISEL64-NEXT: scratch_store_b32 off, v212, s33 offset:500
+; GISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:504
+; GISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:508
+; GISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:512
+; GISEL64-NEXT: s_clause 0xf
+; GISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:516
+; GISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:520
+; GISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:524
+; GISEL64-NEXT: scratch_store_b32 off, v227, s33 offset:528
+; GISEL64-NEXT: scratch_store_b32 off, v228, s33 offset:532
+; GISEL64-NEXT: scratch_store_b32 off, v229, s33 offset:536
+; GISEL64-NEXT: scratch_store_b32 off, v230, s33 offset:540
+; GISEL64-NEXT: scratch_store_b32 off, v231, s33 offset:544
+; GISEL64-NEXT: scratch_store_b32 off, v240, s33 offset:548
+; GISEL64-NEXT: scratch_store_b32 off, v241, s33 offset:552
+; GISEL64-NEXT: scratch_store_b32 off, v242, s33 offset:556
+; GISEL64-NEXT: scratch_store_b32 off, v243, s33 offset:560
+; GISEL64-NEXT: scratch_store_b32 off, v244, s33 offset:564
+; GISEL64-NEXT: scratch_store_b32 off, v245, s33 offset:568
+; GISEL64-NEXT: scratch_store_b32 off, v246, s33 offset:572
+; GISEL64-NEXT: scratch_store_b32 off, v247, s33 offset:576
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_writelane_b32 v40, s0, 4
+; GISEL64-NEXT: v_mov_b32_e32 v2, v0
+; GISEL64-NEXT: v_swap_b32 v0, v1
+; GISEL64-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo
+; GISEL64-NEXT: v_writelane_b32 v40, s4, 0
+; GISEL64-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi
+; GISEL64-NEXT: s_addk_co_i32 s32, 0x250
+; GISEL64-NEXT: v_writelane_b32 v40, s5, 1
+; GISEL64-NEXT: v_writelane_b32 v40, s30, 2
+; GISEL64-NEXT: v_writelane_b32 v40, s31, 3
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT: v_readlane_b32 s31, v40, 3
+; GISEL64-NEXT: v_readlane_b32 s30, v40, 2
+; GISEL64-NEXT: v_readlane_b32 s5, v40, 1
+; GISEL64-NEXT: v_readlane_b32 s4, v40, 0
+; GISEL64-NEXT: v_readlane_b32 s0, v40, 4
+; GISEL64-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GISEL64-NEXT: s_mov_b32 s32, s33
+; GISEL64-NEXT: s_xor_b64 exec, s[4:5], -1
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; GISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; GISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; GISEL64-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; GISEL64-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; GISEL64-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; GISEL64-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; GISEL64-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; GISEL64-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; GISEL64-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; GISEL64-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; GISEL64-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; GISEL64-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; GISEL64-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; GISEL64-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; GISEL64-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; GISEL64-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; GISEL64-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; GISEL64-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; GISEL64-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; GISEL64-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; GISEL64-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; GISEL64-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; GISEL64-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; GISEL64-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; GISEL64-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; GISEL64-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; GISEL64-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; GISEL64-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; GISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; GISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; GISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; GISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; GISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; GISEL64-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; GISEL64-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; GISEL64-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; GISEL64-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; GISEL64-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; GISEL64-NEXT: scratch_load_b32 v48, off, s33 offset:164
+; GISEL64-NEXT: scratch_load_b32 v49, off, s33 offset:168
+; GISEL64-NEXT: scratch_load_b32 v50, off, s33 offset:172
+; GISEL64-NEXT: scratch_load_b32 v51, off, s33 offset:176
+; GISEL64-NEXT: scratch_load_b32 v52, off, s33 offset:180
+; GISEL64-NEXT: scratch_load_b32 v53, off, s33 offset:184
+; GISEL64-NEXT: scratch_load_b32 v54, off, s33 offset:188
+; GISEL64-NEXT: scratch_load_b32 v55, off, s33 offset:192
+; GISEL64-NEXT: scratch_load_b32 v64, off, s33 offset:196
+; GISEL64-NEXT: scratch_load_b32 v65, off, s33 offset:200
+; GISEL64-NEXT: scratch_load_b32 v66, off, s33 offset:204
+; GISEL64-NEXT: scratch_load_b32 v67, off, s33 offset:208
+; GISEL64-NEXT: scratch_load_b32 v68, off, s33 offset:212
+; GISEL64-NEXT: scratch_load_b32 v69, off, s33 offset:216
+; GISEL64-NEXT: scratch_load_b32 v70, off, s33 offset:220
+; GISEL64-NEXT: scratch_load_b32 v71, off, s33 offset:224
+; GISEL64-NEXT: scratch_load_b32 v80, off, s33 offset:228
+; GISEL64-NEXT: scratch_load_b32 v81, off, s33 offset:232
+; GISEL64-NEXT: scratch_load_b32 v82, off, s33 offset:236
+; GISEL64-NEXT: scratch_load_b32 v83, off, s33 offset:240
+; GISEL64-NEXT: scratch_load_b32 v84, off, s33 offset:244
+; GISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:248
+; GISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:252
+; GISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:256
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:260
+; GISEL64-NEXT: scratch_load_b32 v97, off, s33 offset:264
+; GISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:268
+; GISEL64-NEXT: scratch_load_b32 v99, off, s33 offset:272
+; GISEL64-NEXT: scratch_load_b32 v100, off, s33 offset:276
+; GISEL64-NEXT: scratch_load_b32 v101, off, s33 offset:280
+; GISEL64-NEXT: scratch_load_b32 v102, off, s33 offset:284
+; GISEL64-NEXT: scratch_load_b32 v103, off, s33 offset:288
+; GISEL64-NEXT: scratch_load_b32 v112, off, s33 offset:292
+; GISEL64-NEXT: scratch_load_b32 v113, off, s33 offset:296
+; GISEL64-NEXT: scratch_load_b32 v114, off, s33 offset:300
+; GISEL64-NEXT: scratch_load_b32 v115, off, s33 offset:304
+; GISEL64-NEXT: scratch_load_b32 v116, off, s33 offset:308
+; GISEL64-NEXT: scratch_load_b32 v117, off, s33 offset:312
+; GISEL64-NEXT: scratch_load_b32 v118, off, s33 offset:316
+; GISEL64-NEXT: scratch_load_b32 v119, off, s33 offset:320
+; GISEL64-NEXT: scratch_load_b32 v128, off, s33 offset:324
+; GISEL64-NEXT: scratch_load_b32 v129, off, s33 offset:328
+; GISEL64-NEXT: scratch_load_b32 v130, off, s33 offset:332
+; GISEL64-NEXT: scratch_load_b32 v131, off, s33 offset:336
+; GISEL64-NEXT: scratch_load_b32 v132, off, s33 offset:340
+; GISEL64-NEXT: scratch_load_b32 v133, off, s33 offset:344
+; GISEL64-NEXT: scratch_load_b32 v134, off, s33 offset:348
+; GISEL64-NEXT: scratch_load_b32 v135, off, s33 offset:352
+; GISEL64-NEXT: scratch_load_b32 v144, off, s33 offset:356
+; GISEL64-NEXT: scratch_load_b32 v145, off, s33 offset:360
+; GISEL64-NEXT: scratch_load_b32 v146, off, s33 offset:364
+; GISEL64-NEXT: scratch_load_b32 v147, off, s33 offset:368
+; GISEL64-NEXT: scratch_load_b32 v148, off, s33 offset:372
+; GISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:376
+; GISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:380
+; GISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:384
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:388
+; GISEL64-NEXT: scratch_load_b32 v161, off, s33 offset:392
+; GISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:396
+; GISEL64-NEXT: scratch_load_b32 v163, off, s33 offset:400
+; GISEL64-NEXT: scratch_load_b32 v164, off, s33 offset:404
+; GISEL64-NEXT: scratch_load_b32 v165, off, s33 offset:408
+; GISEL64-NEXT: scratch_load_b32 v166, off, s33 offset:412
+; GISEL64-NEXT: scratch_load_b32 v167, off, s33 offset:416
+; GISEL64-NEXT: scratch_load_b32 v176, off, s33 offset:420
+; GISEL64-NEXT: scratch_load_b32 v177, off, s33 offset:424
+; GISEL64-NEXT: scratch_load_b32 v178, off, s33 offset:428
+; GISEL64-NEXT: scratch_load_b32 v179, off, s33 offset:432
+; GISEL64-NEXT: scratch_load_b32 v180, off, s33 offset:436
+; GISEL64-NEXT: scratch_load_b32 v181, off, s33 offset:440
+; GISEL64-NEXT: scratch_load_b32 v182, off, s33 offset:444
+; GISEL64-NEXT: scratch_load_b32 v183, off, s33 offset:448
+; GISEL64-NEXT: scratch_load_b32 v192, off, s33 offset:452
+; GISEL64-NEXT: scratch_load_b32 v193, off, s33 offset:456
+; GISEL64-NEXT: scratch_load_b32 v194, off, s33 offset:460
+; GISEL64-NEXT: scratch_load_b32 v195, off, s33 offset:464
+; GISEL64-NEXT: scratch_load_b32 v196, off, s33 offset:468
+; GISEL64-NEXT: scratch_load_b32 v197, off, s33 offset:472
+; GISEL64-NEXT: scratch_load_b32 v198, off, s33 offset:476
+; GISEL64-NEXT: scratch_load_b32 v199, off, s33 offset:480
+; GISEL64-NEXT: scratch_load_b32 v208, off, s33 offset:484
+; GISEL64-NEXT: scratch_load_b32 v209, off, s33 offset:488
+; GISEL64-NEXT: scratch_load_b32 v210, off, s33 offset:492
+; GISEL64-NEXT: scratch_load_b32 v211, off, s33 offset:496
+; GISEL64-NEXT: scratch_load_b32 v212, off, s33 offset:500
+; GISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:504
+; GISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:508
+; GISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:512
+; GISEL64-NEXT: s_clause 0xf
+; GISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:516
+; GISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:520
+; GISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:524
+; GISEL64-NEXT: scratch_load_b32 v227, off, s33 offset:528
+; GISEL64-NEXT: scratch_load_b32 v228, off, s33 offset:532
+; GISEL64-NEXT: scratch_load_b32 v229, off, s33 offset:536
+; GISEL64-NEXT: scratch_load_b32 v230, off, s33 offset:540
+; GISEL64-NEXT: scratch_load_b32 v231, off, s33 offset:544
+; GISEL64-NEXT: scratch_load_b32 v240, off, s33 offset:548
+; GISEL64-NEXT: scratch_load_b32 v241, off, s33 offset:552
+; GISEL64-NEXT: scratch_load_b32 v242, off, s33 offset:556
+; GISEL64-NEXT: scratch_load_b32 v243, off, s33 offset:560
+; GISEL64-NEXT: scratch_load_b32 v244, off, s33 offset:564
+; GISEL64-NEXT: scratch_load_b32 v245, off, s33 offset:568
+; GISEL64-NEXT: scratch_load_b32 v246, off, s33 offset:572
+; GISEL64-NEXT: scratch_load_b32 v247, off, s33 offset:576
+; GISEL64-NEXT: s_mov_b64 exec, s[4:5]
+; GISEL64-NEXT: s_mov_b32 s33, s0
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+  %ret = call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent
+  ret <2 x half> %ret
+}
--no-leading-addr - | FileCheck --check-prefix=ATTINY85 %s -; RUN: llc < %s -mtriple=avr -mcpu=avr25 -filetype=obj -o - | llvm-objdump --mcpu=avr25 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=AVR25 %s ; RUN: llc < %s -mtriple=avr -mcpu=avr3 -filetype=obj -o - | llvm-objdump --mcpu=avr3 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=AVR3 %s ; ATTINY85: <main>: ; ATTINY85-NEXT: andi r24, 0x1 ; ATTINY85: cpi r24, 0x0 -; ATTINY85-NEXT: breq .+2 -; ATTINY85-NEXT: rjmp .+4086 +; ATTINY85-NEXT: breq .-2 +; ATTINY85-NEXT: R_AVR_7_PCREL .text+0x100c +; ATTINY85-NEXT: rjmp .-2 +; ATTINY85-NEXT: R_AVR_13_PCREL .text+0x2 ; ATTINY85: ldi r24, 0x3 ; ATTINY85-NEXT: ret -; AVR25: <main>: -; AVR25-NEXT: andi r24, 0x1 -; AVR25: cpi r24, 0x0 -; AVR25-NEXT: breq .+2 -; AVR25-NEXT: rjmp .-2 -; AVR25-NEXT: R_AVR_13_PCREL .text+0x2 -; AVR25: ldi r24, 0x3 -; AVR25-NEXT: ret - ; AVR3: <main>: ; AVR3-NEXT: andi r24, 0x1 ; AVR3: cpi r24, 0x0 -; AVR3-NEXT: breq .+4 +; AVR3-NEXT: breq .-2 +; AVR3-NEXT: R_AVR_7_PCREL .text+0x100e ; AVR3-NEXT: jmp 0x0 ; AVR3-NEXT: R_AVR_CALL .text+0x2 ; AVR3: ldi r24, 0x3 diff --git a/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll b/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll index a51cf42..1fc84a7 100644 --- a/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll +++ b/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll @@ -1,28 +1,21 @@ ; RUN: llc < %s -mtriple=avr -mcpu=attiny85 -filetype=obj -o - | llvm-objdump --mcpu=attiny85 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=ATTINY85 %s -; RUN: llc < %s -mtriple=avr -mcpu=avr25 -filetype=obj -o - | llvm-objdump --mcpu=avr25 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=AVR25 %s ; RUN: llc < %s -mtriple=avr -mcpu=avr3 -filetype=obj -o - | llvm-objdump --mcpu=avr3 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=AVR3 %s ; ATTINY85: <main>: ; ATTINY85-NEXT: andi r24, 0x1 ; ATTINY85-NEXT: cpi r24, 0x0 -; ATTINY85-NEXT: brne .+2 -; ATTINY85-NEXT: rjmp .-4092 +; ATTINY85-NEXT: brne .-2 +; ATTINY85-NEXT: R_AVR_7_PCREL .text+0x8 +; ATTINY85-NEXT: rjmp .-2 +; ATTINY85-NEXT: R_AVR_13_PCREL .text+0x100c ; ATTINY85: ldi r24, 0x3 ; ATTINY85-NEXT: ret -; AVR25: <main>: -; AVR25-NEXT: andi r24, 0x1 -; AVR25-NEXT: cpi r24, 0x0 -; AVR25-NEXT: brne .+2 -; AVR25-NEXT: rjmp .-2 -; AVR25-NEXT: R_AVR_13_PCREL .text+0x100c -; AVR25: ldi r24, 0x3 -; AVR25-NEXT: ret - ; AVR3: <main>: ; AVR3-NEXT: andi r24, 0x1 ; AVR3-NEXT: cpi r24, 0x0 -; AVR3-NEXT: brne .+4 +; AVR3-NEXT: brne .-2 +; AVR3-NEXT: R_AVR_7_PCREL .text+0xa ; AVR3-NEXT: jmp 0x0 ; AVR3-NEXT: R_AVR_CALL .text+0x100e ; AVR3: ldi r24, 0x3 diff --git a/llvm/test/CodeGen/AVR/jmp.ll b/llvm/test/CodeGen/AVR/jmp.ll index 95dfff4..1cbc637 100644 --- a/llvm/test/CodeGen/AVR/jmp.ll +++ b/llvm/test/CodeGen/AVR/jmp.ll @@ -18,7 +18,8 @@ declare i8 @bar(i8); ; CHECK: rcall .-2 ; CHECK-NEXT: 00000000: R_AVR_13_PCREL bar ; CHECK-NEXT: cpi r24, 0x7b -; CHECK-NEXT: brne .+4 +; CHECK-NEXT: brne .-2 +; CHECK-NEXT: R_AVR_7_PCREL .text+0xa ; CHECK-NEXT: ldi r24, 0x64 ; CHECK-NEXT: ret ; CHECK-NEXT: ldi r24, 0xc8 diff --git a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll index 0ee3012..ad57bbf 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll @@ -588,3 +588,18 @@ define i2 @vmsk_trunc_i64(<2 x i64> %a) { %res = bitcast <2 x i1> %y to i2 ret i2 %res } + +define i4 
@vmsk_eq_allzeros_v4i8(<4 x i8> %a) { +; CHECK-LABEL: vmsk_eq_allzeros_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vseqi.b $vr0, $vr0, 0 +; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0 +; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0 +; CHECK-NEXT: vslli.w $vr0, $vr0, 24 +; CHECK-NEXT: vmskltz.w $vr0, $vr0 +; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; CHECK-NEXT: ret + %1 = icmp eq <4 x i8> %a, zeroinitializer + %2 = bitcast <4 x i1> %1 to i4 + ret i4 %2 +} diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll index b514c493..278cf01 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll @@ -46,6 +46,7 @@ ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: dynamicVGPRBlockSize: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 +; CHECK-NEXT: isWholeWaveFunction: false ; CHECK-NEXT: body: define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 { entry: @@ -315,6 +316,7 @@ ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: dynamicVGPRBlockSize: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 +; CHECK-NEXT: isWholeWaveFunction: false ; CHECK-NEXT: body: define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 { entry: diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll index fc730f9..890ea44 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll @@ -46,6 +46,7 @@ ; AFTER-PEI-NEXT: hasInitWholeWave: false ; AFTER-PEI-NEXT: dynamicVGPRBlockSize: 0 ; AFTER-PEI-NEXT: scratchReservedForDynamicVGPRs: 0 +; AFTER-PEI-NEXT: isWholeWaveFunction: false ; AFTER-PEI-NEXT: body: define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 { %wide.sgpr0 = call <32 x i32> asm sideeffect "; def $0", "=s" () #0 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll index 5adef14..f84ef8a 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll @@ -46,6 +46,7 @@ ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: dynamicVGPRBlockSize: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 +; CHECK-NEXT: isWholeWaveFunction: false ; CHECK-NEXT: body: define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 { bb0: diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll index fa40164..cc834d0 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll @@ -46,6 +46,7 @@ ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: dynamicVGPRBlockSize: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 +; CHECK-NEXT: isWholeWaveFunction: false ; CHECK-NEXT: body: define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 { bb0: diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir index 24565e4..06c580e 100644 --- 
a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir @@ -55,6 +55,7 @@ # FULL-NEXT: hasInitWholeWave: false # FULL-NEXT: dynamicVGPRBlockSize: 0 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0 +# FULL-NEXT: isWholeWaveFunction: false # FULL-NEXT: body: # SIMPLE: machineFunctionInfo: @@ -162,6 +163,7 @@ body: | # FULL-NEXT: hasInitWholeWave: false # FULL-NEXT: dynamicVGPRBlockSize: 0 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0 +# FULL-NEXT: isWholeWaveFunction: false # FULL-NEXT: body: # SIMPLE: machineFunctionInfo: @@ -240,6 +242,7 @@ body: | # FULL-NEXT: hasInitWholeWave: false # FULL-NEXT: dynamicVGPRBlockSize: 0 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0 +# FULL-NEXT: isWholeWaveFunction: false # FULL-NEXT: body: # SIMPLE: machineFunctionInfo: @@ -319,6 +322,7 @@ body: | # FULL-NEXT: hasInitWholeWave: false # FULL-NEXT: dynamicVGPRBlockSize: 0 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0 +# FULL-NEXT: isWholeWaveFunction: false # FULL-NEXT: body: # SIMPLE: machineFunctionInfo: diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index a152713..4271546 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -56,6 +56,7 @@ ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: dynamicVGPRBlockSize: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 +; CHECK-NEXT: isWholeWaveFunction: false ; CHECK-NEXT: body: define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { %gep = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %arg0 @@ -105,6 +106,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: dynamicVGPRBlockSize: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 +; CHECK-NEXT: isWholeWaveFunction: false ; CHECK-NEXT: body: define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) { %gep = getelementptr inbounds [128 x i32], ptr addrspace(2) @gds, i32 0, i32 %arg0 @@ -178,6 +180,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 { ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: dynamicVGPRBlockSize: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 +; CHECK-NEXT: isWholeWaveFunction: false ; CHECK-NEXT: body: define void @function() { ret void @@ -233,6 +236,7 @@ define void @function() { ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: dynamicVGPRBlockSize: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 +; CHECK-NEXT: isWholeWaveFunction: false ; CHECK-NEXT: body: define void @function_nsz() #0 { ret void diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py b/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py new file mode 100644 index 0000000..8f50206 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py @@ -0,0 +1,14 @@ +# Check all variants of instructions supported by PTX78 on SM90 +# RUN: %python %s --ptx=78 --gpu-arch=90 --aa > %t-ptx78-sm_90.ll +# RUN: FileCheck %t-ptx78-sm_90.ll < %t-ptx78-sm_90.ll \ +# RUN: --check-prefixes=PTX78STMATRIX-DAG +# RUN: llc < %t-ptx78-sm_90.ll -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 \ +# RUN: | FileCheck %t-ptx78-sm_90.ll +# RUN: %if ptxas-12.7 %{ \ +# RUN: llc < %t-ptx78-sm_90.ll -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 \ +# RUN: | %ptxas-verify -arch=sm_90 \ +# RUN: %} + +import wmma + +wmma.main() diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py 
b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py index 6ad0a2a..5c14a54 100644 --- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py @@ -1,9 +1,7 @@ # Check all variants of instructions supported by PTX86 on SM100a # RUN: %python %s --ptx=86 --gpu-arch=100 --aa > %t-ptx86-sm_100a.ll # RUN: FileCheck %t-ptx86-sm_100a.ll < %t-ptx86-sm_100a.ll \ -# RUN: --check-prefixes=PTX86LDMATRIX-DAG -# RUN: FileCheck %t-ptx86-sm_100a.ll < %t-ptx86-sm_100a.ll \ -# RUN: --check-prefixes=PTX86LDMATRIX-DAG +# RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG # RUN: llc < %t-ptx86-sm_100a.ll -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 \ # RUN: | FileCheck %t-ptx86-sm_100a.ll # RUN: %if ptxas-12.7 %{ \ diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py index 7d99534..a77f9ad 100644 --- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py @@ -1,9 +1,7 @@ # Check all variants of instructions supported by PTX86 on SM101a # RUN: %python %s --ptx=86 --gpu-arch=101 --aa > %t-ptx86-sm_101a.ll # RUN: FileCheck %t-ptx86-sm_101a.ll < %t-ptx86-sm_101a.ll \ -# RUN: --check-prefixes=PTX86LDMATRIX-DAG -# RUN: FileCheck %t-ptx86-sm_101a.ll < %t-ptx86-sm_101a.ll \ -# RUN: --check-prefixes=PTX86LDMATRIX-DAG +# RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG # RUN: llc < %t-ptx86-sm_101a.ll -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 \ # RUN: | FileCheck %t-ptx86-sm_101a.ll # RUN: %if ptxas-12.7 %{ \ diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py index 7bddf0b..8126e64 100644 --- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py @@ -1,9 +1,7 @@ # Check all variants of instructions supported by PTX86 on SM120a # RUN: %python %s --ptx=86 --gpu-arch=120 --aa > %t-ptx86-sm_120a.ll # RUN: FileCheck %t-ptx86-sm_120a.ll < %t-ptx86-sm_120a.ll \ -# RUN: --check-prefixes=PTX86LDMATRIX-DAG -# RUN: FileCheck %t-ptx86-sm_120a.ll < %t-ptx86-sm_120a.ll \ -# RUN: --check-prefixes=PTX86LDMATRIX-DAG +# RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG # RUN: llc < %t-ptx86-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 \ # RUN: | FileCheck %t-ptx86-sm_120a.ll # RUN: %if ptxas-12.7 %{ \ diff --git a/llvm/test/CodeGen/NVPTX/wmma.py b/llvm/test/CodeGen/NVPTX/wmma.py index 2ee4896..2eb3c3d 100644 --- a/llvm/test/CodeGen/NVPTX/wmma.py +++ b/llvm/test/CodeGen/NVPTX/wmma.py @@ -10,6 +10,7 @@ import argparse from itertools import product from string import Template + class MMAType: def __init__(self, ptx_type): self.ptx_type = ptx_type @@ -176,6 +177,13 @@ class MMAFrag: "m8n16:x1:b8x16.b4x16_p64": 1, "m8n16:x2:b8x16.b4x16_p64": 2, "m8n16:x4:b8x16.b4x16_p64": 4, + # stmatrix + "m8n8:x1:b16": 1, + "m8n8:x2:b16": 2, + "m8n8:x4:b16": 4, + "m16n8:x1:b8": 1, + "m16n8:x2:b8": 2, + "m16n8:x4:b8": 4, }.get( "%s:%s:%s" % (geom, frag, ptx_elt_type), { @@ -241,6 +249,13 @@ def make_ldmatrix_ops(geoms, frags, types): ] +def make_stmatrix_ops(geoms, frags, types): + return [ + MMAFrag(geom, frag, ptx_type) + for (geom, frag, ptx_type) in product(geoms, frags, types) + ] + + def get_wmma_ops(): return ( make_mma_ops(["m16n16k8"], ["tf32"], [], ["f32"], []) @@ -315,6 +330,12 @@ def get_ldmatrix_ops(): ) +def get_stmatrix_ops(): + return make_stmatrix_ops(["m8n8"], ["x1", "x2", "x4"], ["b16"]) + make_stmatrix_ops( + ["m16n8"], ["x1", "x2", "x4"], ["b8"] + ) + + def 
is_wmma_geom_supported(geom): # geometries for FP and ints. if geom in ["m8n32k16", "m32n8k16"]: @@ -360,6 +381,14 @@ def is_ldmatrix_geom_supported(geom): assert False # Unexpected geometry. +def is_stmatrix_geom_supported(geom): + if geom in ["m8n8"]: + return ptx_version >= 78 and gpu_arch >= 90 + elif geom in ["m16n8"]: + return ptx_version >= 86 and gpu_arch >= 100 and aa + assert False # Unexpected geometry. + + def is_ldmatrix_trans_supported(geom, trans): if geom in ["m8n8"]: return True @@ -369,6 +398,15 @@ def is_ldmatrix_trans_supported(geom, trans): return trans == "" assert False # Unexpected geometry. + +def is_stmatrix_trans_supported(geom, trans): + if geom in ["m8n8"]: + return True + elif geom in ["m16n8"]: + return trans == ".trans" + assert False # Unexpected geometry. + + def is_type_supported(ptx_type): if ptx_type in ["s8", "u8", "s32"]: return ptx_version >= 63 and gpu_arch >= 72 @@ -463,6 +501,16 @@ def is_ldmatrix_variant_supported(frag, trans): return frag.frag in ["x1", "x2", "x4"] +def is_stmatrix_variant_supported(frag, trans): + if not ( + is_type_supported(frag.mma_type.ptx_type) + and is_stmatrix_geom_supported(frag.geom) + and is_stmatrix_trans_supported(frag.geom, trans) + ): + return False + return frag.frag in ["x1", "x2", "x4"] + + def make_wmma_slice_ty(frag): return [frag.mma_type.llvm_type] * frag.nregs @@ -717,6 +765,65 @@ define ${ret_ty} @test_${function}_o(i8 ${as}* %src) { return generated_items +def gen_stmatrix_tests(): + stmatrix_template = """ +declare void @${intrinsic}(i8 ${as}* %dst, ${args}); + +; CHECK-LABEL: .func {{.*}}test_${function}( +define void @test_${function}(i8 ${as}* %dst, ${args}) { +; CHECK: ${instruction} {{.*}}[%rd{{[0-9+]}}] +; CHECK: {${check_args}} + call void @${intrinsic}(i8${as}* %dst, ${args}); + ret void +} + +; CHECK-LABEL: .func{{.*}}test_${function}_o( +define void @test_${function}_o(i8 ${as}* %dst, ${args}) { +; CHECK: ${instruction} {{.*}}[%rd{{[0-9+]}}+128], +; CHECK: {${check_args}} + %dst1 = getelementptr i8, i8 ${as}* %dst, i32 128; + call void @${intrinsic}(i8 ${as}* %dst1, ${args}); + ret void +} +""" + intrinsic_template = ( + "llvm.nvvm.stmatrix.sync.aligned.${geom}.${frag}${trans}.${itype}.${pspace}" + ) + instruction_template = ( + "stmatrix.sync.aligned.${geom}.${frag}${trans}${space}.${itype}" + ) + generated_items = [] + + for frag, space, trans in product( + get_stmatrix_ops(), + ["", ".shared"], + ["", ".trans"], + ): + if not is_stmatrix_variant_supported(frag, trans): + continue + + params = { + "frag": frag.frag, + "space": space, + "trans": trans, + "itype": frag.mma_type.ptx_type, + "pspace": get_pspace(space), + "as": "addrspace(%d)" % get_aspace(space), + "geom": frag.geom, + } + + test_params = params + test_params["intrinsic"] = Template(intrinsic_template).substitute(params) + test_params["function"] = test_params["intrinsic"].replace(".", "_") + test_params["instruction"] = Template(instruction_template).substitute(params) + test_params["args"] = make_wmma_slice_args(frag) + test_params["check_args"] = check_pattern(frag) + + print(Template(stmatrix_template).substitute(test_params)) + generated_items.append((test_params["intrinsic"], test_params["instruction"])) + + return generated_items + def mma_signature(op): if op.a.mma_type.ptx_type == "f16": # FP16 ops identified by accumulator & result type. 
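As a cross-check on the new check lists, the stmatrix predicates added in this hunk can be restated standalone. The following is a minimal illustrative sketch, not part of the patch: the helper names are hypothetical, and the target parameters are passed as explicit arguments instead of the ptx_version/gpu_arch/aa module globals that wmma.py itself reads.

    from itertools import product

    # Mirrors is_stmatrix_geom_supported from the hunk above, with the
    # target described by explicit arguments (hypothetical signature).
    def stmatrix_geom_ok(geom, ptx_version, gpu_arch, aa):
        if geom == "m8n8":
            return ptx_version >= 78 and gpu_arch >= 90
        if geom == "m16n8":
            return ptx_version >= 86 and gpu_arch >= 100 and aa
        return False

    # Mirrors is_stmatrix_trans_supported: m8n8 allows both layouts,
    # m16n8 is only generated with .trans.
    def stmatrix_trans_ok(geom, trans):
        return True if geom == "m8n8" else trans == ".trans"

    def stmatrix_variants(ptx_version, gpu_arch, aa):
        geom_type = {"m8n8": "b16", "m16n8": "b8"}  # element type per geometry
        for geom, frag, trans, space in product(
            geom_type, ["x1", "x2", "x4"], ["", ".trans"], ["", ".shared"]
        ):
            if stmatrix_geom_ok(geom, ptx_version, gpu_arch, aa) and \
               stmatrix_trans_ok(geom, trans):
                # Same field ordering as instruction_template above.
                yield "stmatrix.sync.aligned.%s.%s%s%s.%s" % (
                    geom, frag, trans, space, geom_type[geom])

    print(len(list(stmatrix_variants(78, 90, False))))  # 12 m8n8 b16 forms
    print(len(list(stmatrix_variants(86, 100, True))))  # 18: adds 6 m16n8 .trans b8 forms

Under this sketch, PTX 7.8 / sm_90 yields the twelve m8n8.b16 combinations and PTX 8.6 / sm_100a with arch-accelerated features adds the six m16n8 .trans.b8 ones, matching the PTX78STMATRIX-DAG and PTX86STMATRIX-DAG check lines introduced in the next hunk.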
@@ -893,6 +1000,7 @@ def gen_check_unsupported_ops(items): ; NOALTFLOAT-NOT: .{{bf16|tf32}} ; NODOUBLE-NOT: .f64 ; NOLDMATRIX-NOT: ldmatrix.sync.aligned +; NOSTMATRIX-NOT: stmatrix.sync.aligned ; M16N16-DAG: m16n16k16.load.{{[ab].*}}.f16.p ; M16N16-DAG: m16n16k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p @@ -994,6 +1102,26 @@ def gen_check_unsupported_ops(items): ; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m8n16.x4.b8x16.b6x16_p32 ; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m8n16.x4.b8x16.b4x16_p64 +; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x1.b16 +; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x2.b16 +; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x4.b16 +; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x1.trans.b16 +; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x2.trans.b16 +; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x4.trans.b16 +; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x1.shared.b16 +; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x2.shared.b16 +; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x4.shared.b16 +; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x1.trans.shared.b16 +; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x2.trans.shared.b16 +; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x4.trans.shared.b16 + +; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x1.trans.b8 +; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x2.trans.b8 +; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x4.trans.b8 +; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x1.trans.shared.b8 +; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x2.trans.shared.b8 +; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x4.trans.shared.b8 + ; PTX71MMA-DAG: mma.m8n8k4.row.col.f64 ; PTX71MMA-DAG: mma.m16n8k4.row.col.tf32 ; PTX71MMA-DAG: mma.m16n8k8.row.col.tf32 @@ -1039,6 +1167,7 @@ def gen_tests(): items = gen_wmma_load_tests() items += gen_wmma_store_tests() items += gen_ldmatrix_tests() + items += gen_stmatrix_tests() items += gen_wmma_mma_tests() items += gen_mma_tests() gen_check_unsupported_ops(items) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll index 4b999b8..6864afe 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll @@ -66,7 +66,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind { ; RV64IM-NEXT: srli a2, a2, 32 ; RV64IM-NEXT: mul a1, a2, a1 ; RV64IM-NEXT: srli a1, a1, 32 -; RV64IM-NEXT: subw a0, a0, a1 +; RV64IM-NEXT: sub a0, a0, a1 ; RV64IM-NEXT: srliw a0, a0, 1 ; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: srliw a0, a0, 2 @@ -79,7 +79,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind { ; RV64IMZB-NEXT: zext.w a2, a0 ; RV64IMZB-NEXT: mul a1, a2, a1 ; RV64IMZB-NEXT: srli a1, a1, 32 -; RV64IMZB-NEXT: subw a0, a0, a1 +; RV64IMZB-NEXT: sub a0, a0, a1 ; RV64IMZB-NEXT: srliw a0, a0, 1 ; RV64IMZB-NEXT: add a0, a0, a1 ; RV64IMZB-NEXT: srliw a0, a0, 2 @@ -250,7 +250,7 @@ define i8 @udiv8_constant_add(i8 %a) nounwind { ; RV64-NEXT: zext.b a2, a0 ; RV64-NEXT: mul a1, a2, a1 ; RV64-NEXT: srli a1, a1, 8 -; RV64-NEXT: subw a0, a0, a1 +; RV64-NEXT: sub a0, a0, a1 ; RV64-NEXT: zext.b a0, a0 ; RV64-NEXT: srli a0, a0, 1 ; RV64-NEXT: add a0, a0, a1 @@ -414,8 +414,7 @@ define i32 @sdiv_constant_srai(i32 %a) nounwind { ; RV64-NEXT: addi a1, a1, 1639 ; RV64-NEXT: sext.w a0, a0 ; RV64-NEXT: mul a0, a0, a1 -; RV64-NEXT: srai a0, a0, 32 -; RV64-NEXT: sraiw a0, a0, 1 +; RV64-NEXT: srai a0, a0, 33 ; RV64-NEXT: srliw a1, a0, 31 ; RV64-NEXT: addw a0, a0, a1 ; RV64-NEXT: ret @@ 
-656,8 +655,6 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind { ; RV32IM-NEXT: srai a0, a0, 24 ; RV32IM-NEXT: mul a0, a0, a1 ; RV32IM-NEXT: slli a0, a0, 16 -; RV32IM-NEXT: srai a0, a0, 24 -; RV32IM-NEXT: slli a0, a0, 24 ; RV32IM-NEXT: srai a0, a0, 25 ; RV32IM-NEXT: zext.b a1, a0 ; RV32IM-NEXT: srli a1, a1, 7 @@ -670,9 +667,7 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind { ; RV32IMZB-NEXT: sext.b a0, a0 ; RV32IMZB-NEXT: mul a0, a0, a1 ; RV32IMZB-NEXT: sext.h a0, a0 -; RV32IMZB-NEXT: srai a0, a0, 8 -; RV32IMZB-NEXT: sext.b a0, a0 -; RV32IMZB-NEXT: srai a0, a0, 1 +; RV32IMZB-NEXT: srai a0, a0, 9 ; RV32IMZB-NEXT: zext.b a1, a0 ; RV32IMZB-NEXT: srli a1, a1, 7 ; RV32IMZB-NEXT: add a0, a0, a1 @@ -685,8 +680,6 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind { ; RV64IM-NEXT: srai a0, a0, 56 ; RV64IM-NEXT: mul a0, a0, a1 ; RV64IM-NEXT: slli a0, a0, 48 -; RV64IM-NEXT: srai a0, a0, 56 -; RV64IM-NEXT: slli a0, a0, 56 ; RV64IM-NEXT: srai a0, a0, 57 ; RV64IM-NEXT: zext.b a1, a0 ; RV64IM-NEXT: srli a1, a1, 7 @@ -699,9 +692,7 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind { ; RV64IMZB-NEXT: sext.b a0, a0 ; RV64IMZB-NEXT: mul a0, a0, a1 ; RV64IMZB-NEXT: sext.h a0, a0 -; RV64IMZB-NEXT: srai a0, a0, 8 -; RV64IMZB-NEXT: sext.b a0, a0 -; RV64IMZB-NEXT: srai a0, a0, 1 +; RV64IMZB-NEXT: srai a0, a0, 9 ; RV64IMZB-NEXT: zext.b a1, a0 ; RV64IMZB-NEXT: srli a1, a1, 7 ; RV64IMZB-NEXT: add a0, a0, a1 @@ -816,7 +807,7 @@ define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind { ; RV64IM-NEXT: mul a1, a2, a1 ; RV64IM-NEXT: slli a1, a1, 48 ; RV64IM-NEXT: srai a1, a1, 56 -; RV64IM-NEXT: subw a1, a1, a0 +; RV64IM-NEXT: sub a1, a1, a0 ; RV64IM-NEXT: slli a1, a1, 56 ; RV64IM-NEXT: srai a0, a1, 58 ; RV64IM-NEXT: zext.b a1, a0 @@ -906,8 +897,6 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind { ; RV32IM-NEXT: addi a1, a1, 1639 ; RV32IM-NEXT: srai a0, a0, 16 ; RV32IM-NEXT: mul a0, a0, a1 -; RV32IM-NEXT: srai a0, a0, 16 -; RV32IM-NEXT: slli a0, a0, 16 ; RV32IM-NEXT: srai a0, a0, 17 ; RV32IM-NEXT: slli a1, a0, 16 ; RV32IM-NEXT: srli a1, a1, 16 @@ -921,9 +910,7 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind { ; RV32IMZB-NEXT: addi a1, a1, 1639 ; RV32IMZB-NEXT: sext.h a0, a0 ; RV32IMZB-NEXT: mul a0, a0, a1 -; RV32IMZB-NEXT: srai a0, a0, 16 -; RV32IMZB-NEXT: sext.h a0, a0 -; RV32IMZB-NEXT: srai a0, a0, 1 +; RV32IMZB-NEXT: srai a0, a0, 17 ; RV32IMZB-NEXT: zext.h a1, a0 ; RV32IMZB-NEXT: srli a1, a1, 15 ; RV32IMZB-NEXT: add a0, a0, a1 @@ -936,9 +923,7 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind { ; RV64IM-NEXT: addi a1, a1, 1639 ; RV64IM-NEXT: srai a0, a0, 48 ; RV64IM-NEXT: mul a0, a0, a1 -; RV64IM-NEXT: sraiw a0, a0, 16 -; RV64IM-NEXT: slli a0, a0, 48 -; RV64IM-NEXT: srai a0, a0, 49 +; RV64IM-NEXT: sraiw a0, a0, 17 ; RV64IM-NEXT: slli a1, a0, 48 ; RV64IM-NEXT: srli a1, a1, 48 ; RV64IM-NEXT: srli a1, a1, 15 @@ -951,9 +936,7 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind { ; RV64IMZB-NEXT: addi a1, a1, 1639 ; RV64IMZB-NEXT: sext.h a0, a0 ; RV64IMZB-NEXT: mul a0, a0, a1 -; RV64IMZB-NEXT: sraiw a0, a0, 16 -; RV64IMZB-NEXT: sext.h a0, a0 -; RV64IMZB-NEXT: srai a0, a0, 1 +; RV64IMZB-NEXT: sraiw a0, a0, 17 ; RV64IMZB-NEXT: zext.h a1, a0 ; RV64IMZB-NEXT: srli a1, a1, 15 ; RV64IMZB-NEXT: add a0, a0, a1 @@ -1071,7 +1054,7 @@ define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind { ; RV64IM-NEXT: srai a2, a2, 48 ; RV64IM-NEXT: mul a1, a2, a1 ; RV64IM-NEXT: sraiw a1, a1, 16 -; RV64IM-NEXT: subw a1, a1, a0 +; RV64IM-NEXT: sub a1, a1, a0 ; RV64IM-NEXT: slli a1, a1, 48 ; RV64IM-NEXT: srai a0, a1, 51 ; RV64IM-NEXT: slli 
a1, a0, 48 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/double-convert.ll b/llvm/test/CodeGen/RISCV/GlobalISel/double-convert.ll index a49e94f..620c5ec 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/double-convert.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/double-convert.ll @@ -246,17 +246,11 @@ define double @fcvt_d_wu(i32 %a) nounwind { } define double @fcvt_d_wu_load(ptr %p) nounwind { -; RV32IFD-LABEL: fcvt_d_wu_load: -; RV32IFD: # %bb.0: -; RV32IFD-NEXT: lw a0, 0(a0) -; RV32IFD-NEXT: fcvt.d.wu fa0, a0 -; RV32IFD-NEXT: ret -; -; RV64IFD-LABEL: fcvt_d_wu_load: -; RV64IFD: # %bb.0: -; RV64IFD-NEXT: lwu a0, 0(a0) -; RV64IFD-NEXT: fcvt.d.wu fa0, a0 -; RV64IFD-NEXT: ret +; CHECKIFD-LABEL: fcvt_d_wu_load: +; CHECKIFD: # %bb.0: +; CHECKIFD-NEXT: lw a0, 0(a0) +; CHECKIFD-NEXT: fcvt.d.wu fa0, a0 +; CHECKIFD-NEXT: ret ; ; RV32I-LABEL: fcvt_d_wu_load: ; RV32I: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/float-convert.ll b/llvm/test/CodeGen/RISCV/GlobalISel/float-convert.ll index fa09362..bbea792 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/float-convert.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/float-convert.ll @@ -232,17 +232,11 @@ define float @fcvt_s_wu(i32 %a) nounwind { } define float @fcvt_s_wu_load(ptr %p) nounwind { -; RV32IF-LABEL: fcvt_s_wu_load: -; RV32IF: # %bb.0: -; RV32IF-NEXT: lw a0, 0(a0) -; RV32IF-NEXT: fcvt.s.wu fa0, a0 -; RV32IF-NEXT: ret -; -; RV64IF-LABEL: fcvt_s_wu_load: -; RV64IF: # %bb.0: -; RV64IF-NEXT: lwu a0, 0(a0) -; RV64IF-NEXT: fcvt.s.wu fa0, a0 -; RV64IF-NEXT: ret +; CHECKIF-LABEL: fcvt_s_wu_load: +; CHECKIF: # %bb.0: +; CHECKIF-NEXT: lw a0, 0(a0) +; CHECKIF-NEXT: fcvt.s.wu fa0, a0 +; CHECKIF-NEXT: ret ; ; RV32I-LABEL: fcvt_s_wu_load: ; RV32I: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir index 78a2227b..a7c1c63 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir @@ -88,8 +88,7 @@ body: | ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[ASSERT_SEXT]], [[ASHR]] ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32 ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[SEXT_INREG]], [[ASHR]] - ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[XOR]], 32 - ; RV64I-NEXT: $x10 = COPY [[SEXT_INREG1]](s64) + ; RV64I-NEXT: $x10 = COPY [[XOR]](s64) ; RV64I-NEXT: PseudoRET implicit $x10 ; ; RV64ZBB-LABEL: name: abs_i32 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll index 8a786fc..46d1661 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll @@ -29,7 +29,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind { ; ; RV64I-LABEL: rotl_32: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: sllw a1, a0, a1 ; RV64I-NEXT: srlw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -55,7 +55,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotl_32: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: neg a2, a1 ; RV64XTHEADBB-NEXT: sllw a1, a0, a1 ; RV64XTHEADBB-NEXT: srlw a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -78,7 +78,7 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind { ; ; RV64I-LABEL: rotr_32: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: srlw a1, a0, a1 ; RV64I-NEXT: sllw a0, a0, a2 ; 
RV64I-NEXT: or a0, a1, a0 @@ -104,7 +104,7 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotr_32: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: neg a2, a1 ; RV64XTHEADBB-NEXT: srlw a1, a0, a1 ; RV64XTHEADBB-NEXT: sllw a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -167,7 +167,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind { ; ; RV64I-LABEL: rotl_64: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: sll a1, a0, a1 ; RV64I-NEXT: srl a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -276,7 +276,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotl_64: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: neg a2, a1 ; RV64XTHEADBB-NEXT: sll a1, a0, a1 ; RV64XTHEADBB-NEXT: srl a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -340,7 +340,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind { ; ; RV64I-LABEL: rotr_64: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: srl a1, a0, a1 ; RV64I-NEXT: sll a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -451,7 +451,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotr_64: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: neg a2, a1 ; RV64XTHEADBB-NEXT: srl a1, a0, a1 ; RV64XTHEADBB-NEXT: sll a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -474,7 +474,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64I-LABEL: rotl_32_mask: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: sllw a1, a0, a1 ; RV64I-NEXT: srlw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -490,7 +490,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64ZBB-LABEL: rotl_32_mask: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: negw a2, a1 +; RV64ZBB-NEXT: neg a2, a1 ; RV64ZBB-NEXT: sllw a1, a0, a1 ; RV64ZBB-NEXT: srlw a0, a0, a2 ; RV64ZBB-NEXT: or a0, a1, a0 @@ -506,7 +506,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotl_32_mask: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: neg a2, a1 ; RV64XTHEADBB-NEXT: sllw a1, a0, a1 ; RV64XTHEADBB-NEXT: srlw a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -531,7 +531,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64I-LABEL: rotl_32_mask_and_63_and_31: ; RV64I: # %bb.0: ; RV64I-NEXT: sllw a2, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: srlw a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret @@ -547,7 +547,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64ZBB-LABEL: rotl_32_mask_and_63_and_31: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: sllw a2, a0, a1 -; RV64ZBB-NEXT: negw a1, a1 +; RV64ZBB-NEXT: neg a1, a1 ; RV64ZBB-NEXT: srlw a0, a0, a1 ; RV64ZBB-NEXT: or a0, a2, a0 ; RV64ZBB-NEXT: ret @@ -563,7 +563,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotl_32_mask_and_63_and_31: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sllw a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: srlw a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -632,7 +632,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64I-LABEL: rotr_32_mask: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: srlw a1, a0, a1 ; RV64I-NEXT: sllw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -648,7 +648,7 @@ define 
i32 @rotr_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64ZBB-LABEL: rotr_32_mask: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: negw a2, a1 +; RV64ZBB-NEXT: neg a2, a1 ; RV64ZBB-NEXT: srlw a1, a0, a1 ; RV64ZBB-NEXT: sllw a0, a0, a2 ; RV64ZBB-NEXT: or a0, a1, a0 @@ -664,7 +664,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotr_32_mask: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: neg a2, a1 ; RV64XTHEADBB-NEXT: srlw a1, a0, a1 ; RV64XTHEADBB-NEXT: sllw a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -689,7 +689,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64I-LABEL: rotr_32_mask_and_63_and_31: ; RV64I: # %bb.0: ; RV64I-NEXT: srlw a2, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: sllw a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret @@ -705,7 +705,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64ZBB-LABEL: rotr_32_mask_and_63_and_31: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: srlw a2, a0, a1 -; RV64ZBB-NEXT: negw a1, a1 +; RV64ZBB-NEXT: neg a1, a1 ; RV64ZBB-NEXT: sllw a0, a0, a1 ; RV64ZBB-NEXT: or a0, a2, a0 ; RV64ZBB-NEXT: ret @@ -721,7 +721,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotr_32_mask_and_63_and_31: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srlw a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: sllw a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -829,7 +829,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64I-LABEL: rotl_64_mask: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: sll a1, a0, a1 ; RV64I-NEXT: srl a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -884,7 +884,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64ZBB-LABEL: rotl_64_mask: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: negw a2, a1 +; RV64ZBB-NEXT: neg a2, a1 ; RV64ZBB-NEXT: sll a1, a0, a1 ; RV64ZBB-NEXT: srl a0, a0, a2 ; RV64ZBB-NEXT: or a0, a1, a0 @@ -939,7 +939,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotl_64_mask: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: neg a2, a1 ; RV64XTHEADBB-NEXT: sll a1, a0, a1 ; RV64XTHEADBB-NEXT: srl a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -1005,7 +1005,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64I-LABEL: rotl_64_mask_and_127_and_63: ; RV64I: # %bb.0: ; RV64I-NEXT: sll a2, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: srl a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret @@ -1062,7 +1062,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64ZBB-LABEL: rotl_64_mask_and_127_and_63: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: sll a2, a0, a1 -; RV64ZBB-NEXT: negw a1, a1 +; RV64ZBB-NEXT: neg a1, a1 ; RV64ZBB-NEXT: srl a0, a0, a1 ; RV64ZBB-NEXT: or a0, a2, a0 ; RV64ZBB-NEXT: ret @@ -1119,7 +1119,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64XTHEADBB-LABEL: rotl_64_mask_and_127_and_63: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sll a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: srl a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -1277,7 +1277,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64I-LABEL: rotr_64_mask: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: 
neg a2, a1 ; RV64I-NEXT: srl a1, a0, a1 ; RV64I-NEXT: sll a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -1331,7 +1331,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64ZBB-LABEL: rotr_64_mask: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: negw a2, a1 +; RV64ZBB-NEXT: neg a2, a1 ; RV64ZBB-NEXT: srl a1, a0, a1 ; RV64ZBB-NEXT: sll a0, a0, a2 ; RV64ZBB-NEXT: or a0, a1, a0 @@ -1385,7 +1385,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotr_64_mask: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: neg a2, a1 ; RV64XTHEADBB-NEXT: srl a1, a0, a1 ; RV64XTHEADBB-NEXT: sll a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -1451,7 +1451,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64I-LABEL: rotr_64_mask_and_127_and_63: ; RV64I: # %bb.0: ; RV64I-NEXT: srl a2, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: sll a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret @@ -1508,7 +1508,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64ZBB-LABEL: rotr_64_mask_and_127_and_63: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: srl a2, a0, a1 -; RV64ZBB-NEXT: negw a1, a1 +; RV64ZBB-NEXT: neg a1, a1 ; RV64ZBB-NEXT: sll a0, a0, a1 ; RV64ZBB-NEXT: or a0, a2, a0 ; RV64ZBB-NEXT: ret @@ -1565,7 +1565,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64XTHEADBB-LABEL: rotr_64_mask_and_127_and_63: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srl a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: sll a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -1701,7 +1701,7 @@ define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign ; RV64I: # %bb.0: ; RV64I-NEXT: andi a3, a2, 31 ; RV64I-NEXT: sllw a4, a0, a2 -; RV64I-NEXT: negw a3, a3 +; RV64I-NEXT: neg a3, a3 ; RV64I-NEXT: srlw a0, a0, a3 ; RV64I-NEXT: or a0, a4, a0 ; RV64I-NEXT: sllw a1, a1, a2 @@ -1737,7 +1737,7 @@ define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: andi a3, a2, 31 ; RV64XTHEADBB-NEXT: sllw a4, a0, a2 -; RV64XTHEADBB-NEXT: negw a3, a3 +; RV64XTHEADBB-NEXT: neg a3, a3 ; RV64XTHEADBB-NEXT: srlw a0, a0, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 ; RV64XTHEADBB-NEXT: sllw a1, a1, a2 @@ -1822,7 +1822,7 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV64I: # %bb.0: ; RV64I-NEXT: andi a3, a2, 63 ; RV64I-NEXT: sll a4, a0, a2 -; RV64I-NEXT: negw a3, a3 +; RV64I-NEXT: neg a3, a3 ; RV64I-NEXT: srl a0, a0, a3 ; RV64I-NEXT: or a0, a4, a0 ; RV64I-NEXT: sll a1, a1, a2 @@ -1972,7 +1972,7 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: andi a3, a2, 63 ; RV64XTHEADBB-NEXT: sll a4, a0, a2 -; RV64XTHEADBB-NEXT: negw a3, a3 +; RV64XTHEADBB-NEXT: neg a3, a3 ; RV64XTHEADBB-NEXT: srl a0, a0, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 ; RV64XTHEADBB-NEXT: sll a1, a1, a2 @@ -2002,7 +2002,7 @@ define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign ; RV64I: # %bb.0: ; RV64I-NEXT: andi a3, a2, 31 ; RV64I-NEXT: srlw a4, a0, a2 -; RV64I-NEXT: negw a3, a3 +; RV64I-NEXT: neg a3, a3 ; RV64I-NEXT: sllw a0, a0, a3 ; RV64I-NEXT: or a0, a4, a0 ; RV64I-NEXT: sllw a1, a1, a2 @@ -2038,7 +2038,7 @@ define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: andi a3, 
a2, 31 ; RV64XTHEADBB-NEXT: srlw a4, a0, a2 -; RV64XTHEADBB-NEXT: negw a3, a3 +; RV64XTHEADBB-NEXT: neg a3, a3 ; RV64XTHEADBB-NEXT: sllw a0, a0, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 ; RV64XTHEADBB-NEXT: sllw a1, a1, a2 @@ -2125,7 +2125,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV64I: # %bb.0: ; RV64I-NEXT: andi a3, a2, 63 ; RV64I-NEXT: srl a4, a0, a2 -; RV64I-NEXT: negw a3, a3 +; RV64I-NEXT: neg a3, a3 ; RV64I-NEXT: sll a0, a0, a3 ; RV64I-NEXT: or a0, a4, a0 ; RV64I-NEXT: sll a1, a1, a2 @@ -2279,7 +2279,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: andi a3, a2, 63 ; RV64XTHEADBB-NEXT: srl a4, a0, a2 -; RV64XTHEADBB-NEXT: negw a3, a3 +; RV64XTHEADBB-NEXT: neg a3, a3 ; RV64XTHEADBB-NEXT: sll a0, a0, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 ; RV64XTHEADBB-NEXT: sll a1, a1, a2 @@ -2312,8 +2312,8 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64I-NEXT: andi a3, a2, 31 ; RV64I-NEXT: sllw a4, a0, a2 ; RV64I-NEXT: sllw a2, a1, a2 -; RV64I-NEXT: negw a5, a3 -; RV64I-NEXT: negw a3, a3 +; RV64I-NEXT: neg a5, a3 +; RV64I-NEXT: neg a3, a3 ; RV64I-NEXT: srlw a0, a0, a5 ; RV64I-NEXT: srlw a1, a1, a3 ; RV64I-NEXT: or a0, a4, a0 @@ -2353,8 +2353,8 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64XTHEADBB-NEXT: andi a3, a2, 31 ; RV64XTHEADBB-NEXT: sllw a4, a0, a2 ; RV64XTHEADBB-NEXT: sllw a2, a1, a2 -; RV64XTHEADBB-NEXT: negw a5, a3 -; RV64XTHEADBB-NEXT: negw a3, a3 +; RV64XTHEADBB-NEXT: neg a5, a3 +; RV64XTHEADBB-NEXT: neg a3, a3 ; RV64XTHEADBB-NEXT: srlw a0, a0, a5 ; RV64XTHEADBB-NEXT: srlw a1, a1, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 @@ -2464,7 +2464,7 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV64I-NEXT: andi a3, a2, 63 ; RV64I-NEXT: sll a4, a0, a2 ; RV64I-NEXT: sll a2, a1, a2 -; RV64I-NEXT: negw a3, a3 +; RV64I-NEXT: neg a3, a3 ; RV64I-NEXT: srl a0, a0, a3 ; RV64I-NEXT: srl a1, a1, a3 ; RV64I-NEXT: or a0, a4, a0 @@ -2664,7 +2664,7 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV64XTHEADBB-NEXT: andi a3, a2, 63 ; RV64XTHEADBB-NEXT: sll a4, a0, a2 ; RV64XTHEADBB-NEXT: sll a2, a1, a2 -; RV64XTHEADBB-NEXT: negw a3, a3 +; RV64XTHEADBB-NEXT: neg a3, a3 ; RV64XTHEADBB-NEXT: srl a0, a0, a3 ; RV64XTHEADBB-NEXT: srl a1, a1, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 @@ -2697,8 +2697,8 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64I-NEXT: andi a3, a2, 31 ; RV64I-NEXT: srlw a4, a0, a2 ; RV64I-NEXT: srlw a2, a1, a2 -; RV64I-NEXT: negw a5, a3 -; RV64I-NEXT: negw a3, a3 +; RV64I-NEXT: neg a5, a3 +; RV64I-NEXT: neg a3, a3 ; RV64I-NEXT: sllw a0, a0, a5 ; RV64I-NEXT: sllw a1, a1, a3 ; RV64I-NEXT: or a0, a4, a0 @@ -2738,8 +2738,8 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64XTHEADBB-NEXT: andi a3, a2, 31 ; RV64XTHEADBB-NEXT: srlw a4, a0, a2 ; RV64XTHEADBB-NEXT: srlw a2, a1, a2 -; RV64XTHEADBB-NEXT: negw a5, a3 -; RV64XTHEADBB-NEXT: negw a3, a3 +; RV64XTHEADBB-NEXT: neg a5, a3 +; RV64XTHEADBB-NEXT: neg a3, a3 ; RV64XTHEADBB-NEXT: sllw a0, a0, a5 ; RV64XTHEADBB-NEXT: sllw a1, a1, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 @@ -2850,7 +2850,7 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV64I-NEXT: andi a3, a2, 63 ; RV64I-NEXT: srl a4, a0, a2 ; RV64I-NEXT: srl a2, a1, a2 -; RV64I-NEXT: negw a3, a3 +; RV64I-NEXT: neg a3, a3 ; RV64I-NEXT: sll a0, 
a0, a3 ; RV64I-NEXT: sll a1, a1, a3 ; RV64I-NEXT: or a0, a4, a0 @@ -3052,7 +3052,7 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV64XTHEADBB-NEXT: andi a3, a2, 63 ; RV64XTHEADBB-NEXT: srl a4, a0, a2 ; RV64XTHEADBB-NEXT: srl a2, a1, a2 -; RV64XTHEADBB-NEXT: negw a3, a3 +; RV64XTHEADBB-NEXT: neg a3, a3 ; RV64XTHEADBB-NEXT: sll a0, a0, a3 ; RV64XTHEADBB-NEXT: sll a1, a1, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 @@ -3116,7 +3116,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV64I-LABEL: rotl_64_zext: ; RV64I: # %bb.0: ; RV64I-NEXT: li a2, 64 -; RV64I-NEXT: subw a2, a2, a1 +; RV64I-NEXT: sub a2, a2, a1 ; RV64I-NEXT: sll a1, a0, a1 ; RV64I-NEXT: srl a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -3171,7 +3171,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV64ZBB-LABEL: rotl_64_zext: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: li a2, 64 -; RV64ZBB-NEXT: subw a2, a2, a1 +; RV64ZBB-NEXT: sub a2, a2, a1 ; RV64ZBB-NEXT: sll a1, a0, a1 ; RV64ZBB-NEXT: srl a0, a0, a2 ; RV64ZBB-NEXT: or a0, a1, a0 @@ -3226,7 +3226,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotl_64_zext: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: li a2, 64 -; RV64XTHEADBB-NEXT: subw a2, a2, a1 +; RV64XTHEADBB-NEXT: sub a2, a2, a1 ; RV64XTHEADBB-NEXT: sll a1, a0, a1 ; RV64XTHEADBB-NEXT: srl a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -3289,7 +3289,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; RV64I-LABEL: rotr_64_zext: ; RV64I: # %bb.0: ; RV64I-NEXT: li a2, 64 -; RV64I-NEXT: subw a2, a2, a1 +; RV64I-NEXT: sub a2, a2, a1 ; RV64I-NEXT: srl a1, a0, a1 ; RV64I-NEXT: sll a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -3343,7 +3343,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; RV64ZBB-LABEL: rotr_64_zext: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: li a2, 64 -; RV64ZBB-NEXT: subw a2, a2, a1 +; RV64ZBB-NEXT: sub a2, a2, a1 ; RV64ZBB-NEXT: srl a1, a0, a1 ; RV64ZBB-NEXT: sll a0, a0, a2 ; RV64ZBB-NEXT: or a0, a1, a0 @@ -3397,7 +3397,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotr_64_zext: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: li a2, 64 -; RV64XTHEADBB-NEXT: subw a2, a2, a1 +; RV64XTHEADBB-NEXT: sub a2, a2, a1 ; RV64XTHEADBB-NEXT: srl a1, a0, a1 ; RV64XTHEADBB-NEXT: sll a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll index 1eddb8f..b7f84ba 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll @@ -107,7 +107,7 @@ declare i32 @llvm.fshl.i32(i32, i32, i32) define signext i32 @rol_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: rol_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: sllw a1, a0, a1 ; RV64I-NEXT: srlw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -125,7 +125,7 @@ define signext i32 @rol_i32(i32 signext %a, i32 signext %b) nounwind { define void @rol_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind { ; RV64I-LABEL: rol_i32_nosext: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a3, a1 +; RV64I-NEXT: neg a3, a1 ; RV64I-NEXT: sllw a1, a0, a1 ; RV64I-NEXT: srlw a0, a0, a3 ; RV64I-NEXT: or a0, a1, a0 @@ -146,7 +146,7 @@ define signext i32 @rol_i32_neg_constant_rhs(i32 signext %a) nounwind { ; RV64I-LABEL: rol_i32_neg_constant_rhs: ; RV64I: # %bb.0: ; RV64I-NEXT: li a1, -2 -; RV64I-NEXT: negw a2, a0 +; RV64I-NEXT: neg a2, a0 ; RV64I-NEXT: sllw a0, a1, a0 ; RV64I-NEXT: srlw a1, 
a1, a2 ; RV64I-NEXT: or a0, a0, a1 @@ -166,7 +166,7 @@ declare i64 @llvm.fshl.i64(i64, i64, i64) define i64 @rol_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: rol_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: sll a1, a0, a1 ; RV64I-NEXT: srl a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -185,7 +185,7 @@ declare i32 @llvm.fshr.i32(i32, i32, i32) define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: ror_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: srlw a1, a0, a1 ; RV64I-NEXT: sllw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -203,7 +203,7 @@ define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind { define void @ror_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind { ; RV64I-LABEL: ror_i32_nosext: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a3, a1 +; RV64I-NEXT: neg a3, a1 ; RV64I-NEXT: srlw a1, a0, a1 ; RV64I-NEXT: sllw a0, a0, a3 ; RV64I-NEXT: or a0, a1, a0 @@ -224,7 +224,7 @@ define signext i32 @ror_i32_neg_constant_rhs(i32 signext %a) nounwind { ; RV64I-LABEL: ror_i32_neg_constant_rhs: ; RV64I: # %bb.0: ; RV64I-NEXT: li a1, -2 -; RV64I-NEXT: negw a2, a0 +; RV64I-NEXT: neg a2, a0 ; RV64I-NEXT: srlw a0, a1, a0 ; RV64I-NEXT: sllw a1, a1, a2 ; RV64I-NEXT: or a0, a0, a1 @@ -244,7 +244,7 @@ declare i64 @llvm.fshr.i64(i64, i64, i64) define i64 @ror_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: ror_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: srl a1, a0, a1 ; RV64I-NEXT: sll a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll index 9690302..2dd3bb3 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll @@ -31,7 +31,7 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 @@ -88,7 +88,7 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 @@ -103,7 +103,7 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srliw a0, a0, 24 ; RV64I-NEXT: li a1, 32 -; RV64I-NEXT: subw a0, a1, a0 +; RV64I-NEXT: sub a0, a1, a0 ; RV64I-NEXT: j .LBB1_3 ; RV64I-NEXT: .LBB1_2: ; RV64I-NEXT: li a0, 32 @@ -153,7 +153,7 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 @@ -168,7 +168,7 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind { ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srliw a0, a0, 24 ; RV64I-NEXT: li a1, 32 -; RV64I-NEXT: subw a1, a1, a0 +; RV64I-NEXT: sub a1, a1, a0 ; RV64I-NEXT: .LBB2_2: # %cond.end ; RV64I-NEXT: subw a0, s0, a1 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -212,7 +212,7 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui 
a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -283,7 +283,7 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -412,7 +412,7 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -455,7 +455,7 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -497,7 +497,7 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -553,7 +553,7 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -672,7 +672,7 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -728,7 +728,7 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -748,7 +748,7 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
;
; RV64ZBB-LABEL: ctpop_i32_load:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: lwu a0, 0(a0)
+; RV64ZBB-NEXT: lw a0, 0(a0)
; RV64ZBB-NEXT: cpopw a0, a0
; RV64ZBB-NEXT: ret
%a = load i32, ptr %p
@@ -1053,9 +1053,8 @@ define signext i32 @abs_i32_sext(i32 signext %x) {
; RV64I-LABEL: abs_i32_sext:
; RV64I: # %bb.0:
; RV64I-NEXT: srai a1, a0, 31
-; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: addw a0, a0, a1
; RV64I-NEXT: xor a0, a0, a1
-; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: abs_i32_sext:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
index cd59c9e..ba058ca 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
@@ -114,7 +114,7 @@ define i64 @pack_i64_2(i32 signext %a, i32 signext %b) nounwind {
define i64 @pack_i64_3(ptr %0, ptr %1) {
; RV64I-LABEL: pack_i64_3:
; RV64I: # %bb.0:
-; RV64I-NEXT: lwu a0, 0(a0)
+; RV64I-NEXT: lw a0, 0(a0)
; RV64I-NEXT: lwu a1, 0(a1)
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a1
@@ -122,8 +122,8 @@ define i64 @pack_i64_3(ptr %0, ptr %1) {
;
; RV64ZBKB-LABEL: pack_i64_3:
; RV64ZBKB: # %bb.0:
-; RV64ZBKB-NEXT: lwu a0, 0(a0)
-; RV64ZBKB-NEXT: lwu a1, 0(a1)
+; RV64ZBKB-NEXT: lw a0, 0(a0)
+; RV64ZBKB-NEXT: lw a1, 0(a1)
; RV64ZBKB-NEXT: pack a0, a1, a0
; RV64ZBKB-NEXT: ret
%3 = load i32, ptr %0, align 4
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
index 8b262db..d634cc9 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
@@ -330,13 +330,13 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
; RV64I-NEXT: li a3, 64
; RV64I-NEXT: bltu a2, a3, .LBB6_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a4, a2, a3
+; RV64I-NEXT: sub a4, a2, a3
; RV64I-NEXT: srl a4, a1, a4
; RV64I-NEXT: bnez a2, .LBB6_3
; RV64I-NEXT: j .LBB6_4
; RV64I-NEXT: .LBB6_2:
; RV64I-NEXT: srl a4, a0, a2
-; RV64I-NEXT: negw a5, a2
+; RV64I-NEXT: neg a5, a2
; RV64I-NEXT: sll a5, a1, a5
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: beqz a2, .LBB6_4
@@ -476,13 +476,13 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
; RV64I-NEXT: li a3, 64
; RV64I-NEXT: bltu a2, a3, .LBB7_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a4, a2, a3
+; RV64I-NEXT: sub a4, a2, a3
; RV64I-NEXT: sra a4, a1, a4
; RV64I-NEXT: bnez a2, .LBB7_3
; RV64I-NEXT: j .LBB7_4
; RV64I-NEXT: .LBB7_2:
; RV64I-NEXT: srl a4, a0, a2
-; RV64I-NEXT: negw a5, a2
+; RV64I-NEXT: neg a5, a2
; RV64I-NEXT: sll a5, a1, a5
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: beqz a2, .LBB7_4
@@ -615,13 +615,13 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
; RV64I-NEXT: bltu a2, a4, .LBB8_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: li a0, 0
-; RV64I-NEXT: subw a4, a2, a4
+; RV64I-NEXT: sub a4, a2, a4
; RV64I-NEXT: sll a3, a3, a4
; RV64I-NEXT: bnez a2, .LBB8_3
; RV64I-NEXT: j .LBB8_4
; RV64I-NEXT: .LBB8_2:
; RV64I-NEXT: sll a0, a3, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
; RV64I-NEXT: srl a3, a3, a4
; RV64I-NEXT: sll a4, a1, a2
; RV64I-NEXT: or a3, a3, a4
@@ -685,7 +685,7 @@ define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind {
;
; RV64I-LABEL: fshr64_minsize:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -914,12 +914,12 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
; RV64I-NEXT: li a4, 64
; RV64I-NEXT: bltu a5, a4, .LBB10_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a3, a5, a4
+; RV64I-NEXT: sub a3, a5, a4
; RV64I-NEXT: srl a6, a1, a3
; RV64I-NEXT: j .LBB10_3
; RV64I-NEXT: .LBB10_2:
; RV64I-NEXT: srl a3, a0, a2
-; RV64I-NEXT: negw a6, a5
+; RV64I-NEXT: neg a6, a5
; RV64I-NEXT: sll a6, a1, a6
; RV64I-NEXT: or a6, a3, a6
; RV64I-NEXT: .LBB10_3:
@@ -928,7 +928,7 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
; RV64I-NEXT: # %bb.4:
; RV64I-NEXT: mv a3, a6
; RV64I-NEXT: .LBB10_5:
-; RV64I-NEXT: negw a7, a2
+; RV64I-NEXT: neg a7, a2
; RV64I-NEXT: bltu a5, a4, .LBB10_7
; RV64I-NEXT: # %bb.6:
; RV64I-NEXT: li a2, 0
@@ -940,13 +940,13 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
; RV64I-NEXT: bltu a6, a4, .LBB10_10
; RV64I-NEXT: # %bb.9:
; RV64I-NEXT: li a5, 0
-; RV64I-NEXT: subw a4, a6, a4
+; RV64I-NEXT: sub a4, a6, a4
; RV64I-NEXT: sll a0, a0, a4
; RV64I-NEXT: bnez a6, .LBB10_11
; RV64I-NEXT: j .LBB10_12
; RV64I-NEXT: .LBB10_10:
; RV64I-NEXT: sll a5, a0, a7
-; RV64I-NEXT: negw a4, a6
+; RV64I-NEXT: neg a4, a6
; RV64I-NEXT: srl a0, a0, a4
; RV64I-NEXT: sll a4, a1, a7
; RV64I-NEXT: or a0, a0, a4
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
index 69519c0..014b1c1 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -758,13 +758,13 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB6_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a5, a1, a4
+; RV64I-NEXT: sub a5, a1, a4
; RV64I-NEXT: srl a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB6_3
; RV64I-NEXT: j .LBB6_4
; RV64I-NEXT: .LBB6_2:
; RV64I-NEXT: srl a5, a0, a1
-; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: neg a6, a1
; RV64I-NEXT: sll a6, a3, a6
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: beqz a1, .LBB6_4
@@ -1091,13 +1091,13 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB7_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a5, a1, a4
+; RV64I-NEXT: sub a5, a1, a4
; RV64I-NEXT: srl a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB7_3
; RV64I-NEXT: j .LBB7_4
; RV64I-NEXT: .LBB7_2:
; RV64I-NEXT: srl a5, a0, a1
-; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: neg a6, a1
; RV64I-NEXT: sll a6, a3, a6
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: beqz a1, .LBB7_4
@@ -1425,13 +1425,13 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu a3, a5, .LBB8_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: li a1, 0
-; RV64I-NEXT: subw a5, a3, a5
+; RV64I-NEXT: sub a5, a3, a5
; RV64I-NEXT: sll a4, a4, a5
; RV64I-NEXT: bnez a3, .LBB8_3
; RV64I-NEXT: j .LBB8_4
; RV64I-NEXT: .LBB8_2:
; RV64I-NEXT: sll a1, a4, a3
-; RV64I-NEXT: negw a5, a3
+; RV64I-NEXT: neg a5, a3
; RV64I-NEXT: srl a4, a4, a5
; RV64I-NEXT: sll a5, a0, a3
; RV64I-NEXT: or a4, a4, a5
@@ -1754,13 +1754,13 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: bltu a3, a5, .LBB9_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: li a1, 0
-; RV64I-NEXT: subw a5, a3, a5
+; RV64I-NEXT: sub a5, a3, a5
; RV64I-NEXT: sll a4, a4, a5
; RV64I-NEXT: bnez a3, .LBB9_3
; RV64I-NEXT: j .LBB9_4
; RV64I-NEXT: .LBB9_2:
; RV64I-NEXT: sll a1, a4, a3
-; RV64I-NEXT: negw a5, a3
+; RV64I-NEXT: neg a5, a3
; RV64I-NEXT: srl a4, a4, a5
; RV64I-NEXT: sll a5, a0, a3
; RV64I-NEXT: or a4, a4, a5
@@ -2083,13 +2083,13 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB10_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a5, a1, a4
+; RV64I-NEXT: sub a5, a1, a4
; RV64I-NEXT: sra a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB10_3
; RV64I-NEXT: j .LBB10_4
; RV64I-NEXT: .LBB10_2:
; RV64I-NEXT: srl a5, a0, a1
-; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: neg a6, a1
; RV64I-NEXT: sll a6, a3, a6
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: beqz a1, .LBB10_4
@@ -2416,13 +2416,13 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB11_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a5, a1, a4
+; RV64I-NEXT: sub a5, a1, a4
; RV64I-NEXT: sra a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB11_3
; RV64I-NEXT: j .LBB11_4
; RV64I-NEXT: .LBB11_2:
; RV64I-NEXT: srl a5, a0, a1
-; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: neg a6, a1
; RV64I-NEXT: sll a6, a3, a6
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: beqz a1, .LBB11_4
@@ -2796,8 +2796,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or t0, t5, t3
; RV64I-NEXT: or a5, s0, t6
; RV64I-NEXT: slli a5, a5, 3
-; RV64I-NEXT: subw t1, a5, a7
-; RV64I-NEXT: negw t5, a5
+; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: neg t5, a5
; RV64I-NEXT: sll t3, t0, t5
; RV64I-NEXT: bltu a5, a7, .LBB12_2
; RV64I-NEXT: # %bb.1:
@@ -2842,7 +2842,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bgeu t6, a7, .LBB12_14
; RV64I-NEXT: .LBB12_12:
; RV64I-NEXT: sll t5, a6, t5
-; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: neg s0, t6
; RV64I-NEXT: srl s0, a6, s0
; RV64I-NEXT: or s1, s0, t3
; RV64I-NEXT: j .LBB12_15
@@ -2851,7 +2851,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu t6, a7, .LBB12_12
; RV64I-NEXT: .LBB12_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: subw t3, t6, a7
+; RV64I-NEXT: sub t3, t6, a7
; RV64I-NEXT: sll s1, a6, t3
; RV64I-NEXT: .LBB12_15:
; RV64I-NEXT: sub s0, a5, t1
@@ -2862,13 +2862,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: .LBB12_17:
; RV64I-NEXT: bltu s0, a7, .LBB12_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: subw t6, s0, a7
+; RV64I-NEXT: sub t6, s0, a7
; RV64I-NEXT: srl t6, t0, t6
; RV64I-NEXT: bnez s0, .LBB12_20
; RV64I-NEXT: j .LBB12_21
; RV64I-NEXT: .LBB12_19:
; RV64I-NEXT: srl t6, a6, s0
-; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: neg s1, s0
; RV64I-NEXT: sll s1, t0, s1
; RV64I-NEXT: or t6, t6, s1
; RV64I-NEXT: beqz s0, .LBB12_21
@@ -3720,8 +3720,8 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or t0, t5, t3
; RV64I-NEXT: or a5, s0, t6
; RV64I-NEXT: slli a5, a5, 5
-; RV64I-NEXT: subw t1, a5, a7
-; RV64I-NEXT: negw t5, a5
+; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: neg t5, a5
; RV64I-NEXT: sll t3, t0, t5
; RV64I-NEXT: bltu a5, a7, .LBB13_2
; RV64I-NEXT: # %bb.1:
@@ -3766,7 +3766,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: bgeu t6, a7, .LBB13_14
; RV64I-NEXT: .LBB13_12:
; RV64I-NEXT: sll t5, a6, t5
-; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: neg s0, t6
; RV64I-NEXT: srl s0, a6, s0
; RV64I-NEXT: or s1, s0, t3
; RV64I-NEXT: j .LBB13_15
@@ -3775,7 +3775,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: bltu t6, a7, .LBB13_12
; RV64I-NEXT: .LBB13_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: subw t3, t6, a7
+; RV64I-NEXT: sub t3, t6, a7
; RV64I-NEXT: sll s1, a6, t3
; RV64I-NEXT: .LBB13_15:
; RV64I-NEXT: sub s0, a5, t1
@@ -3786,13 +3786,13 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: .LBB13_17:
; RV64I-NEXT: bltu s0, a7, .LBB13_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: subw t6, s0, a7
+; RV64I-NEXT: sub t6, s0, a7
; RV64I-NEXT: srl t6, t0, t6
; RV64I-NEXT: bnez s0, .LBB13_20
; RV64I-NEXT: j .LBB13_21
; RV64I-NEXT: .LBB13_19:
; RV64I-NEXT: srl t6, a6, s0
-; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: neg s1, s0
; RV64I-NEXT: sll s1, t0, s1
; RV64I-NEXT: or t6, t6, s1
; RV64I-NEXT: beqz s0, .LBB13_21
@@ -4644,8 +4644,8 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: or t0, t5, t3
; RV64I-NEXT: or a5, s0, t6
; RV64I-NEXT: slli a5, a5, 6
-; RV64I-NEXT: subw t1, a5, a7
-; RV64I-NEXT: negw t5, a5
+; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: neg t5, a5
; RV64I-NEXT: sll t3, t0, t5
; RV64I-NEXT: bltu a5, a7, .LBB14_2
; RV64I-NEXT: # %bb.1:
@@ -4690,7 +4690,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: bgeu t6, a7, .LBB14_14
; RV64I-NEXT: .LBB14_12:
; RV64I-NEXT: sll t5, a6, t5
-; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: neg s0, t6
; RV64I-NEXT: srl s0, a6, s0
; RV64I-NEXT: or s1, s0, t3
; RV64I-NEXT: j .LBB14_15
@@ -4699,7 +4699,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: bltu t6, a7, .LBB14_12
; RV64I-NEXT: .LBB14_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: subw t3, t6, a7
+; RV64I-NEXT: sub t3, t6, a7
; RV64I-NEXT: sll s1, a6, t3
; RV64I-NEXT: .LBB14_15:
; RV64I-NEXT: sub s0, a5, t1
@@ -4710,13 +4710,13 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: .LBB14_17:
; RV64I-NEXT: bltu s0, a7, .LBB14_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: subw t6, s0, a7
+; RV64I-NEXT: sub t6, s0, a7
; RV64I-NEXT: srl t6, t0, t6
; RV64I-NEXT: bnez s0, .LBB14_20
; RV64I-NEXT: j .LBB14_21
; RV64I-NEXT: .LBB14_19:
; RV64I-NEXT: srl t6, a6, s0
-; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: neg s1, s0
; RV64I-NEXT: sll s1, t0, s1
; RV64I-NEXT: or t6, t6, s1
; RV64I-NEXT: beqz s0, .LBB14_21
@@ -5542,8 +5542,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, s0, a6
; RV64I-NEXT: or a6, a1, s5
; RV64I-NEXT: slli a6, a6, 3
-; RV64I-NEXT: subw t2, a6, t0
-; RV64I-NEXT: negw t3, a6
+; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: neg t3, a6
; RV64I-NEXT: srl s0, t1, t3
; RV64I-NEXT: bltu a6, t0, .LBB15_2
; RV64I-NEXT: # %bb.1:
@@ -5585,11 +5585,11 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: slli s4, s9, 16
; RV64I-NEXT: bltu a4, t0, .LBB15_7
; RV64I-NEXT: # %bb.6:
-; RV64I-NEXT: subw s0, a4, t0
+; RV64I-NEXT: sub s0, a4, t0
; RV64I-NEXT: srl s0, a5, s0
; RV64I-NEXT: j .LBB15_8
; RV64I-NEXT: .LBB15_7:
-; RV64I-NEXT: negw s6, a4
+; RV64I-NEXT: neg s6, a4
; RV64I-NEXT: sll s6, a5, s6
; RV64I-NEXT: or s0, s0, s6
; RV64I-NEXT: .LBB15_8:
@@ -5637,13 +5637,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu s0, t0, .LBB15_20
; RV64I-NEXT: # %bb.19:
; RV64I-NEXT: li t2, 0
-; RV64I-NEXT: subw t0, s0, t0
+; RV64I-NEXT: sub t0, s0, t0
; RV64I-NEXT: sll t0, t1, t0
; RV64I-NEXT: bnez s0, .LBB15_21
; RV64I-NEXT: j .LBB15_22
; RV64I-NEXT: .LBB15_20:
; RV64I-NEXT: sll t2, t1, s0
-; RV64I-NEXT: negw t0, s0
+; RV64I-NEXT: neg t0, s0
; RV64I-NEXT: srl t0, t1, t0
; RV64I-NEXT: sll t1, a5, s0
; RV64I-NEXT: or t0, t0, t1
@@ -6456,8 +6456,8 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: or a5, s0, a6
; RV64I-NEXT: or a6, a1, s5
; RV64I-NEXT: slli a6, a6, 5
-; RV64I-NEXT: subw t2, a6, t0
-; RV64I-NEXT: negw t3, a6
+; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: neg t3, a6
; RV64I-NEXT: srl s0, t1, t3
; RV64I-NEXT: bltu a6, t0, .LBB16_2
; RV64I-NEXT: # %bb.1:
@@ -6499,11 +6499,11 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: slli s4, s9, 16
; RV64I-NEXT: bltu a4, t0, .LBB16_7
; RV64I-NEXT: # %bb.6:
-; RV64I-NEXT: subw s0, a4, t0
+; RV64I-NEXT: sub s0, a4, t0
; RV64I-NEXT: srl s0, a5, s0
; RV64I-NEXT: j .LBB16_8
; RV64I-NEXT: .LBB16_7:
-; RV64I-NEXT: negw s6, a4
+; RV64I-NEXT: neg s6, a4
; RV64I-NEXT: sll s6, a5, s6
; RV64I-NEXT: or s0, s0, s6
; RV64I-NEXT: .LBB16_8:
@@ -6551,13 +6551,13 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: bltu s0, t0, .LBB16_20
; RV64I-NEXT: # %bb.19:
; RV64I-NEXT: li t2, 0
-; RV64I-NEXT: subw t0, s0, t0
+; RV64I-NEXT: sub t0, s0, t0
; RV64I-NEXT: sll t0, t1, t0
; RV64I-NEXT: bnez s0, .LBB16_21
; RV64I-NEXT: j .LBB16_22
; RV64I-NEXT: .LBB16_20:
; RV64I-NEXT: sll t2, t1, s0
-; RV64I-NEXT: negw t0, s0
+; RV64I-NEXT: neg t0, s0
; RV64I-NEXT: srl t0, t1, t0
; RV64I-NEXT: sll t1, a5, s0
; RV64I-NEXT: or t0, t0, t1
@@ -7370,8 +7370,8 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: or a5, s0, a6
; RV64I-NEXT: or a6, a1, s5
; RV64I-NEXT: slli a6, a6, 6
-; RV64I-NEXT: subw t2, a6, t0
-; RV64I-NEXT: negw t3, a6
+; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: neg t3, a6
; RV64I-NEXT: srl s0, t1, t3
; RV64I-NEXT: bltu a6, t0, .LBB17_2
; RV64I-NEXT: # %bb.1:
@@ -7413,11 +7413,11 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: slli s4, s9, 16
; RV64I-NEXT: bltu a4, t0, .LBB17_7
; RV64I-NEXT: # %bb.6:
-; RV64I-NEXT: subw s0, a4, t0
+; RV64I-NEXT: sub s0, a4, t0
; RV64I-NEXT: srl s0, a5, s0
; RV64I-NEXT: j .LBB17_8
; RV64I-NEXT: .LBB17_7:
-; RV64I-NEXT: negw s6, a4
+; RV64I-NEXT: neg s6, a4
; RV64I-NEXT: sll s6, a5, s6
; RV64I-NEXT: or s0, s0, s6
; RV64I-NEXT: .LBB17_8:
@@ -7465,13 +7465,13 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: bltu s0, t0, .LBB17_20
; RV64I-NEXT: # %bb.19:
; RV64I-NEXT: li t2, 0
-; RV64I-NEXT: subw t0, s0, t0
+; RV64I-NEXT: sub t0, s0, t0
; RV64I-NEXT: sll t0, t1, t0
; RV64I-NEXT: bnez s0, .LBB17_21
; RV64I-NEXT: j .LBB17_22
; RV64I-NEXT: .LBB17_20:
; RV64I-NEXT: sll t2, t1, s0
-; RV64I-NEXT: negw t0, s0
+; RV64I-NEXT: neg t0, s0
; RV64I-NEXT: srl t0, t1, t0
; RV64I-NEXT: sll t1, a5, s0
; RV64I-NEXT: or t0, t0, t1
@@ -8310,8 +8310,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, t5, t4
; RV64I-NEXT: or a6, s0, t6
; RV64I-NEXT: slli a6, a6, 3
-; RV64I-NEXT: subw t1, a6, t0
-; RV64I-NEXT: negw t5, a6
+; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: neg t5, a6
; RV64I-NEXT: sll t4, a5, t5
; RV64I-NEXT: bltu a6, t0, .LBB18_2
; RV64I-NEXT: # %bb.1:
@@ -8356,7 +8356,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bgeu t6, t0, .LBB18_14
; RV64I-NEXT: .LBB18_12:
; RV64I-NEXT: sll t5, a7, t5
-; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: neg s0, t6
; RV64I-NEXT: srl s0, a7, s0
; RV64I-NEXT: or s1, s0, t4
; RV64I-NEXT: j .LBB18_15
@@ -8365,7 +8365,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu t6, t0, .LBB18_12
; RV64I-NEXT: .LBB18_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: subw t4, t6, t0
+; RV64I-NEXT: sub t4, t6, t0
; RV64I-NEXT: sll s1, a7, t4
; RV64I-NEXT: .LBB18_15:
; RV64I-NEXT: sub s0, a6, t1
@@ -8376,13 +8376,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: .LBB18_17:
; RV64I-NEXT: bltu s0, t0, .LBB18_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: subw t6, s0, t0
+; RV64I-NEXT: sub t6, s0, t0
; RV64I-NEXT: sra t6, a5, t6
; RV64I-NEXT: bnez s0, .LBB18_20
; RV64I-NEXT: j .LBB18_21
; RV64I-NEXT: .LBB18_19:
; RV64I-NEXT: srl t6, a7, s0
-; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: neg s1, s0
; RV64I-NEXT: sll s1, a5, s1
; RV64I-NEXT: or t6, t6, s1
; RV64I-NEXT: beqz s0, .LBB18_21
@@ -9241,8 +9241,8 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or a5, t5, t4
; RV64I-NEXT: or a6, s0, t6
; RV64I-NEXT: slli a6, a6, 5
-; RV64I-NEXT: subw t1, a6, t0
-; RV64I-NEXT: negw t5, a6
+; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: neg t5, a6
; RV64I-NEXT: sll t4, a5, t5
; RV64I-NEXT: bltu a6, t0, .LBB19_2
; RV64I-NEXT: # %bb.1:
@@ -9287,7 +9287,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: bgeu t6, t0, .LBB19_14
; RV64I-NEXT: .LBB19_12:
; RV64I-NEXT: sll t5, a7, t5
-; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: neg s0, t6
; RV64I-NEXT: srl s0, a7, s0
; RV64I-NEXT: or s1, s0, t4
; RV64I-NEXT: j .LBB19_15
@@ -9296,7 +9296,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: bltu t6, t0, .LBB19_12
; RV64I-NEXT: .LBB19_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: subw t4, t6, t0
+; RV64I-NEXT: sub t4, t6, t0
; RV64I-NEXT: sll s1, a7, t4
; RV64I-NEXT: .LBB19_15:
; RV64I-NEXT: sub s0, a6, t1
@@ -9307,13 +9307,13 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: .LBB19_17:
; RV64I-NEXT: bltu s0, t0, .LBB19_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: subw t6, s0, t0
+; RV64I-NEXT: sub t6, s0, t0
; RV64I-NEXT: sra t6, a5, t6
; RV64I-NEXT: bnez s0, .LBB19_20
; RV64I-NEXT: j .LBB19_21
; RV64I-NEXT: .LBB19_19:
; RV64I-NEXT: srl t6, a7, s0
-; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: neg s1, s0
; RV64I-NEXT: sll s1, a5, s1
; RV64I-NEXT: or t6, t6, s1
; RV64I-NEXT: beqz s0, .LBB19_21
@@ -10172,8 +10172,8 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: or a5, t5, t4
; RV64I-NEXT: or a6, s0, t6
; RV64I-NEXT: slli a6, a6, 6
-; RV64I-NEXT: subw t1, a6, t0
-; RV64I-NEXT: negw t5, a6
+; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: neg t5, a6
; RV64I-NEXT: sll t4, a5, t5
; RV64I-NEXT: bltu a6, t0, .LBB20_2
; RV64I-NEXT: # %bb.1:
@@ -10218,7 +10218,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: bgeu t6, t0, .LBB20_14
; RV64I-NEXT: .LBB20_12:
; RV64I-NEXT: sll t5, a7, t5
-; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: neg s0, t6
; RV64I-NEXT: srl s0, a7, s0
; RV64I-NEXT: or s1, s0, t4
; RV64I-NEXT: j .LBB20_15
@@ -10227,7 +10227,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: bltu t6, t0, .LBB20_12
; RV64I-NEXT: .LBB20_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: subw t4, t6, t0
+; RV64I-NEXT: sub t4, t6, t0
; RV64I-NEXT: sll s1, a7, t4
; RV64I-NEXT: .LBB20_15:
; RV64I-NEXT: sub s0, a6, t1
@@ -10238,13 +10238,13 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: .LBB20_17:
; RV64I-NEXT: bltu s0, t0, .LBB20_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: subw t6, s0, t0
+; RV64I-NEXT: sub t6, s0, t0
; RV64I-NEXT: sra t6, a5, t6
; RV64I-NEXT: bnez s0, .LBB20_20
; RV64I-NEXT: j .LBB20_21
; RV64I-NEXT: .LBB20_19:
; RV64I-NEXT: srl t6, a7, s0
-; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: neg s1, s0
; RV64I-NEXT: sll s1, a5, s1
; RV64I-NEXT: or t6, t6, s1
; RV64I-NEXT: beqz s0, .LBB20_21
diff --git a/llvm/test/CodeGen/RISCV/abds-neg.ll b/llvm/test/CodeGen/RISCV/abds-neg.ll
index 3fb0f2c..41f73f5 100644
--- a/llvm/test/CodeGen/RISCV/abds-neg.ll
+++ b/llvm/test/CodeGen/RISCV/abds-neg.ll
@@ -2221,7 +2221,7 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind {
;
; RV64I-LABEL: abd_subnsw_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: subw a0, a1, a0
@@ -2236,7 +2236,7 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind {
;
; RV64ZBB-LABEL: abd_subnsw_i32:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: subw a0, a0, a1
+; RV64ZBB-NEXT: sub a0, a0, a1
; RV64ZBB-NEXT: sraiw a1, a0, 31
; RV64ZBB-NEXT: xor a0, a0, a1
; RV64ZBB-NEXT: subw a0, a1, a0
@@ -2258,7 +2258,7 @@ define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind {
;
; RV64I-LABEL: abd_subnsw_i32_undef:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: subw a0, a1, a0
@@ -2273,7 +2273,7 @@ define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind {
;
; RV64ZBB-LABEL: abd_subnsw_i32_undef:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: subw a0, a0, a1
+; RV64ZBB-NEXT: sub a0, a0, a1
; RV64ZBB-NEXT: sraiw a1, a0, 31
; RV64ZBB-NEXT: xor a0, a0, a1
; RV64ZBB-NEXT: subw a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll
index efb4e1a..28a95ef 100644
--- a/llvm/test/CodeGen/RISCV/abds.ll
+++ b/llvm/test/CodeGen/RISCV/abds.ll
@@ -1733,21 +1733,13 @@ define i8 @abd_subnsw_i8(i8 %a, i8 %b) nounwind {
; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: ret
;
-; RV32ZBB-LABEL: abd_subnsw_i8:
-; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: sub a0, a0, a1
-; RV32ZBB-NEXT: sext.b a0, a0
-; RV32ZBB-NEXT: neg a1, a0
-; RV32ZBB-NEXT: max a0, a0, a1
-; RV32ZBB-NEXT: ret
-;
-; RV64ZBB-LABEL: abd_subnsw_i8:
-; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: subw a0, a0, a1
-; RV64ZBB-NEXT: sext.b a0, a0
-; RV64ZBB-NEXT: neg a1, a0
-; RV64ZBB-NEXT: max a0, a0, a1
-; RV64ZBB-NEXT: ret
+; ZBB-LABEL: abd_subnsw_i8:
+; ZBB: # %bb.0:
+; ZBB-NEXT: sub a0, a0, a1
+; ZBB-NEXT: sext.b a0, a0
+; ZBB-NEXT: neg a1, a0
+; ZBB-NEXT: max a0, a0, a1
+; ZBB-NEXT: ret
%sub = sub nsw i8 %a, %b
%abs = call i8 @llvm.abs.i8(i8 %sub, i1 false)
ret i8 %abs
@@ -1772,21 +1764,13 @@ define i8 @abd_subnsw_i8_undef(i8 %a, i8 %b) nounwind {
; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: ret
;
-; RV32ZBB-LABEL: abd_subnsw_i8_undef:
-; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: sub a0, a0, a1
-; RV32ZBB-NEXT: sext.b a0, a0
-; RV32ZBB-NEXT: neg a1, a0
-; RV32ZBB-NEXT: max a0, a0, a1
-; RV32ZBB-NEXT: ret
-;
-; RV64ZBB-LABEL: abd_subnsw_i8_undef:
-; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: subw a0, a0, a1
-; RV64ZBB-NEXT: sext.b a0, a0
-; RV64ZBB-NEXT: neg a1, a0
-; RV64ZBB-NEXT: max a0, a0, a1
-; RV64ZBB-NEXT: ret
+; ZBB-LABEL: abd_subnsw_i8_undef:
+; ZBB: # %bb.0:
+; ZBB-NEXT: sub a0, a0, a1
+; ZBB-NEXT: sext.b a0, a0
+; ZBB-NEXT: neg a1, a0
+; ZBB-NEXT: max a0, a0, a1
+; ZBB-NEXT: ret
%sub = sub nsw i8 %a, %b
%abs = call i8 @llvm.abs.i8(i8 %sub, i1 true)
ret i8 %abs
@@ -1811,21 +1795,13 @@ define i16 @abd_subnsw_i16(i16 %a, i16 %b) nounwind {
; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: ret
;
-; RV32ZBB-LABEL: abd_subnsw_i16:
-; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: sub a0, a0, a1
-; RV32ZBB-NEXT: sext.h a0, a0
-; RV32ZBB-NEXT: neg a1, a0
-; RV32ZBB-NEXT: max a0, a0, a1
-; RV32ZBB-NEXT: ret
-;
-; RV64ZBB-LABEL: abd_subnsw_i16:
-; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: subw a0, a0, a1
-; RV64ZBB-NEXT: sext.h a0, a0
-; RV64ZBB-NEXT: neg a1, a0
-; RV64ZBB-NEXT: max a0, a0, a1
-; RV64ZBB-NEXT: ret
+; ZBB-LABEL: abd_subnsw_i16:
+; ZBB: # %bb.0:
+; ZBB-NEXT: sub a0, a0, a1
+; ZBB-NEXT: sext.h a0, a0
+; ZBB-NEXT: neg a1, a0
+; ZBB-NEXT: max a0, a0, a1
+; ZBB-NEXT: ret
%sub = sub nsw i16 %a, %b
%abs = call i16 @llvm.abs.i16(i16 %sub, i1 false)
ret i16 %abs
@@ -1850,21 +1826,13 @@ define i16 @abd_subnsw_i16_undef(i16 %a, i16 %b) nounwind {
; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: ret
;
-; RV32ZBB-LABEL: abd_subnsw_i16_undef:
-; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: sub a0, a0, a1
-; RV32ZBB-NEXT: sext.h a0, a0
-; RV32ZBB-NEXT: neg a1, a0
-; RV32ZBB-NEXT: max a0, a0, a1
-; RV32ZBB-NEXT: ret
-;
-; RV64ZBB-LABEL: abd_subnsw_i16_undef:
-; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: subw a0, a0, a1
-; RV64ZBB-NEXT: sext.h a0, a0
-; RV64ZBB-NEXT: neg a1, a0
-; RV64ZBB-NEXT: max a0, a0, a1
-; RV64ZBB-NEXT: ret
+; ZBB-LABEL: abd_subnsw_i16_undef:
+; ZBB: # %bb.0:
+; ZBB-NEXT: sub a0, a0, a1
+; ZBB-NEXT: sext.h a0, a0
+; ZBB-NEXT: neg a1, a0
+; ZBB-NEXT: max a0, a0, a1
+; ZBB-NEXT: ret
%sub = sub nsw i16 %a, %b
%abs = call i16 @llvm.abs.i16(i16 %sub, i1 true)
ret i16 %abs
@@ -1881,7 +1849,7 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind {
;
; RV64I-LABEL: abd_subnsw_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: subw a0, a0, a1
@@ -1916,7 +1884,7 @@ define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind {
;
; RV64I-LABEL: abd_subnsw_i32_undef:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: subw a0, a0, a1
@@ -2317,7 +2285,7 @@ define i32 @abd_sub_i32(i32 %a, i32 %b) nounwind {
;
; RV64I-LABEL: abd_sub_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: subw a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
index aac355e..3b2cab2 100644
--- a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
+++ b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
@@ -20,7 +20,7 @@ define i32 @add_mul_combine_accept_a1(i32 %x) {
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: sh1add a1, a0, a0
; RV64IMB-NEXT: slli a0, a0, 5
-; RV64IMB-NEXT: subw a0, a0, a1
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: addiw a0, a0, 1073
; RV64IMB-NEXT: ret
%tmp0 = add i32 %x, 37
@@ -41,7 +41,7 @@ define signext i32 @add_mul_combine_accept_a2(i32 signext %x) {
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: sh1add a1, a0, a0
; RV64IMB-NEXT: slli a0, a0, 5
-; RV64IMB-NEXT: subw a0, a0, a1
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: addiw a0, a0, 1073
; RV64IMB-NEXT: ret
%tmp0 = add i32 %x, 37
@@ -93,7 +93,7 @@ define i32 @add_mul_combine_accept_b1(i32 %x) {
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: sh3add a1, a0, a0
; RV64IMB-NEXT: slli a0, a0, 5
-; RV64IMB-NEXT: subw a0, a0, a1
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: lui a1, 50
; RV64IMB-NEXT: addi a1, a1, 1119
; RV64IMB-NEXT: addw a0, a0, a1
@@ -118,7 +118,7 @@ define signext i32 @add_mul_combine_accept_b2(i32 signext %x) {
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: sh3add a1, a0, a0
; RV64IMB-NEXT: slli a0, a0, 5
-; RV64IMB-NEXT: subw a0, a0, a1
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: lui a1, 50
; RV64IMB-NEXT: addi a1, a1, 1119
; RV64IMB-NEXT: addw a0, a0, a1
@@ -456,7 +456,7 @@ define i32 @add_mul_combine_reject_f1(i32 %x) {
; RV64IMB-NEXT: addi a0, a0, 1972
; RV64IMB-NEXT: sh1add a1, a0, a0
; RV64IMB-NEXT: slli a0, a0, 5
-; RV64IMB-NEXT: subw a0, a0, a1
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: addiw a0, a0, 11
; RV64IMB-NEXT: ret
%tmp0 = mul i32 %x, 29
@@ -479,7 +479,7 @@ define signext i32 @add_mul_combine_reject_f2(i32 signext %x) {
; RV64IMB-NEXT: addi a0, a0, 1972
; RV64IMB-NEXT: sh1add a1, a0, a0
; RV64IMB-NEXT: slli a0, a0, 5
-; RV64IMB-NEXT: subw a0, a0, a1
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: addiw a0, a0, 11
; RV64IMB-NEXT: ret
%tmp0 = mul i32 %x, 29
diff --git a/llvm/test/CodeGen/RISCV/aext-to-sext.ll b/llvm/test/CodeGen/RISCV/aext-to-sext.ll
index f3f71a9..34549a0 100644
--- a/llvm/test/CodeGen/RISCV/aext-to-sext.ll
+++ b/llvm/test/CodeGen/RISCV/aext-to-sext.ll
@@ -16,7 +16,7 @@ define void @quux(i32 signext %arg, i32 signext %arg1) nounwind {
; RV64I-NEXT: addi sp, sp, -16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT: subw s0, a1, a0
+; RV64I-NEXT: sub s0, a1, a0
; RV64I-NEXT: .LBB0_2: # %bb2
; RV64I-NEXT: # =>This Inner Loop Header: Depth=1
; RV64I-NEXT: call hoge
diff --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll
index bebc097..7d29ac9 100644
--- a/llvm/test/CodeGen/RISCV/atomic-signext.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-signext.ll
@@ -4582,7 +4582,7 @@ define signext i32 @atomicrmw_and_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: ret
; RV64I-NEXT: .LBB56_2: # %else
-; RV64I-NEXT: lwu a1, 0(a0)
+; RV64I-NEXT: lw a1, 0(a0)
; RV64I-NEXT: andi a2, a1, 1
; RV64I-NEXT: sw a2, 0(a0)
; RV64I-NEXT: sext.w a0, a1
@@ -4700,7 +4700,7 @@ define signext i32 @atomicrmw_nand_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: ret
; RV64I-NEXT: .LBB57_2: # %else
-; RV64I-NEXT: lwu a1, 0(a0)
+; RV64I-NEXT: lw a1, 0(a0)
; RV64I-NEXT: andi a2, a1, 1
; RV64I-NEXT: sw a2, 0(a0)
; RV64I-NEXT: sext.w a0, a1
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
index 27704d1..ea9786d 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
@@ -161,7 +161,7 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) {
; RV64IA-NEXT: sltu t0, t0, a5
; RV64IA-NEXT: addi t0, t0, -1
; RV64IA-NEXT: and t0, t0, a1
-; RV64IA-NEXT: subw a6, a6, t0
+; RV64IA-NEXT: sub a6, a6, t0
; RV64IA-NEXT: zext.b a6, a6
; RV64IA-NEXT: sllw a6, a6, a0
; RV64IA-NEXT: and a3, a3, a4
@@ -345,7 +345,7 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) {
; RV64IA-NEXT: sltu t1, t1, a6
; RV64IA-NEXT: addi t1, t1, -1
; RV64IA-NEXT: and t1, t1, a1
-; RV64IA-NEXT: subw a7, a7, t1
+; RV64IA-NEXT: sub a7, a7, t1
; RV64IA-NEXT: and a7, a7, a3
; RV64IA-NEXT: sllw a7, a7, a0
; RV64IA-NEXT: and a4, a4, a5
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
index ada1933..4e04f38 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
@@ -150,7 +150,7 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
; RV64IA-NEXT: zext.b a7, a5
; RV64IA-NEXT: addi a5, a5, 1
; RV64IA-NEXT: sltu a7, a7, a1
-; RV64IA-NEXT: negw a7, a7
+; RV64IA-NEXT: neg a7, a7
; RV64IA-NEXT: and a5, a7, a5
; RV64IA-NEXT: zext.b a5, a5
; RV64IA-NEXT: sllw a5, a5, a0
@@ -325,7 +325,7 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
; RV64IA-NEXT: addi a6, a6, 1
; RV64IA-NEXT: sltu t0, t0, a1
; RV64IA-NEXT: and a6, a6, a3
-; RV64IA-NEXT: negw t0, t0
+; RV64IA-NEXT: neg t0, t0
; RV64IA-NEXT: and a6, t0, a6
; RV64IA-NEXT: sllw a6, a6, a0
; RV64IA-NEXT: and a4, a4, a5
diff --git a/llvm/test/CodeGen/RISCV/bfloat-convert.ll b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
index 3422ea6..6207a17 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-convert.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
@@ -1074,7 +1074,7 @@ define bfloat @fcvt_bf16_wu_load(ptr %p) nounwind {
;
; CHECK64ZFBFMIN-LABEL: fcvt_bf16_wu_load:
; CHECK64ZFBFMIN: # %bb.0:
-; CHECK64ZFBFMIN-NEXT: lwu a0, 0(a0)
+; CHECK64ZFBFMIN-NEXT: lw a0, 0(a0)
; CHECK64ZFBFMIN-NEXT: fcvt.s.wu fa5, a0
; CHECK64ZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5
; CHECK64ZFBFMIN-NEXT: ret
@@ -1083,7 +1083,7 @@ define bfloat @fcvt_bf16_wu_load(ptr %p) nounwind {
; RV64ID: # %bb.0:
; RV64ID-NEXT: addi sp, sp, -16
; RV64ID-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64ID-NEXT: lwu a0, 0(a0)
+; RV64ID-NEXT: lw a0, 0(a0)
; RV64ID-NEXT: fcvt.s.wu fa0, a0
; RV64ID-NEXT: call __truncsfbf2
; RV64ID-NEXT: fmv.x.w a0, fa0
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index 72489185..530980c 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -63,7 +63,7 @@ define i8 @test_cttz_i8(i8 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: srli a1, a0, 1
; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
+; RV64NOZBB-NEXT: sub a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 2
; RV64NOZBB-NEXT: andi a0, a0, 51
@@ -262,7 +262,7 @@ define i32 @test_cttz_i32(i32 %a) nounwind {
; RV64I-NEXT: sext.w a1, a0
; RV64I-NEXT: beqz a1, .LBB2_2
; RV64I-NEXT: # %bb.1: # %cond.false
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: slli a1, a0, 6
; RV64I-NEXT: slli a2, a0, 8
@@ -270,16 +270,16 @@ define i32 @test_cttz_i32(i32 %a) nounwind {
; RV64I-NEXT: slli a4, a0, 12
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
; RV64I-NEXT: add a1, a4, a1
; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a0, a0, 27
; RV64I-NEXT: add a1, a1, a3
; RV64I-NEXT: add a0, a2, a0
@@ -318,7 +318,7 @@ define i32 @test_cttz_i32(i32 %a) nounwind {
; RV64M-NEXT: sext.w a1, a0
; RV64M-NEXT: beqz a1, .LBB2_2
; RV64M-NEXT: # %bb.1: # %cond.false
-; RV64M-NEXT: negw a1, a0
+; RV64M-NEXT: neg a1, a0
; RV64M-NEXT: and a0, a0, a1
; RV64M-NEXT: lui a1, 30667
; RV64M-NEXT: addi a1, a1, 1329
@@ -597,7 +597,7 @@ define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: srli a1, a0, 1
; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
+; RV64NOZBB-NEXT: sub a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 2
; RV64NOZBB-NEXT: andi a0, a0, 51
@@ -743,7 +743,7 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind {
;
; RV64I-LABEL: test_cttz_i32_zero_undef:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: slli a1, a0, 6
; RV64I-NEXT: slli a2, a0, 8
@@ -751,16 +751,16 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind {
; RV64I-NEXT: slli a4, a0, 12
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
; RV64I-NEXT: add a1, a4, a1
; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a0, a0, 27
; RV64I-NEXT: add a1, a1, a3
; RV64I-NEXT: add a0, a2, a0
@@ -788,7 +788,7 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind {
;
; RV64M-LABEL: test_cttz_i32_zero_undef:
; RV64M: # %bb.0:
-; RV64M-NEXT: negw a1, a0
+; RV64M-NEXT: neg a1, a0
; RV64M-NEXT: and a0, a0, a1
; RV64M-NEXT: lui a1, 30667
; RV64M-NEXT: addi a1, a1, 1329
@@ -1039,7 +1039,7 @@ define i8 @test_ctlz_i8(i8 %a) nounwind {
; RV64NOZBB-NEXT: not a0, a0
; RV64NOZBB-NEXT: srli a1, a0, 1
; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
+; RV64NOZBB-NEXT: sub a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 2
; RV64NOZBB-NEXT: andi a0, a0, 51
@@ -1711,7 +1711,7 @@ define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind {
; RV64NOZBB-NEXT: not a0, a0
; RV64NOZBB-NEXT: srli a1, a0, 1
; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
+; RV64NOZBB-NEXT: sub a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 2
; RV64NOZBB-NEXT: andi a0, a0, 51
@@ -2296,7 +2296,7 @@ define i8 @test_ctpop_i8(i8 %a) nounwind {
; RV64NOZBB: # %bb.0:
; RV64NOZBB-NEXT: srli a1, a0, 1
; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
+; RV64NOZBB-NEXT: sub a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 2
; RV64NOZBB-NEXT: andi a0, a0, 51
@@ -2336,7 +2336,7 @@ define i8 @test_ctpop_i8(i8 %a) nounwind {
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srli a1, a0, 1
; RV64XTHEADBB-NEXT: andi a1, a1, 85
-; RV64XTHEADBB-NEXT: subw a0, a0, a1
+; RV64XTHEADBB-NEXT: sub a0, a0, a1
; RV64XTHEADBB-NEXT: andi a1, a0, 51
; RV64XTHEADBB-NEXT: srli a0, a0, 2
; RV64XTHEADBB-NEXT: andi a0, a0, 51
diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
index 637fb31..a1061fbb 100644
--- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
+++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
@@ -163,7 +163,7 @@ define i64 @ctz_dereferencing_pointer_zext(ptr %b) nounwind {
; RV64I-LABEL: ctz_dereferencing_pointer_zext:
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: lw a0, 0(a0)
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -171,16 +171,16 @@ define i64 @ctz_dereferencing_pointer_zext(ptr %b) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
@@ -248,7 +248,7 @@ define signext i32 @ctz1(i32 signext %x) nounwind {
;
; RV64I-LABEL: ctz1:
; RV64I: # %bb.0: # %entry
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -256,16 +256,16 @@ define signext i32 @ctz1(i32 signext %x) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
@@ -331,7 +331,7 @@ define signext i32 @ctz1_flipped(i32 signext %x) nounwind {
;
; RV64I-LABEL: ctz1_flipped:
; RV64I: # %bb.0: # %entry
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -339,16 +339,16 @@ define signext i32 @ctz1_flipped(i32 signext %x) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
@@ -412,7 +412,7 @@ define signext i32 @ctz2(i32 signext %x) nounwind {
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: beqz a0, .LBB4_2
; RV64I-NEXT: # %bb.1: # %cond.false
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: slli a1, a0, 6
; RV64I-NEXT: slli a2, a0, 8
@@ -420,16 +420,16 @@ define signext i32 @ctz2(i32 signext %x) nounwind {
; RV64I-NEXT: slli a4, a0, 12
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
; RV64I-NEXT: add a1, a4, a1
; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a0, a0, 27
; RV64I-NEXT: add a1, a1, a3
; RV64I-NEXT: add a0, a2, a0
@@ -490,7 +490,7 @@ define signext i32 @ctz3(i32 signext %x) nounwind {
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: beqz a0, .LBB5_2
; RV64I-NEXT: # %bb.1: # %cond.false
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: slli a1, a0, 6
; RV64I-NEXT: slli a2, a0, 8
@@ -498,16 +498,16 @@ define signext i32 @ctz3(i32 signext %x) nounwind {
; RV64I-NEXT: slli a4, a0, 12
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
; RV64I-NEXT: add a1, a4, a1
; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a0, a0, 27
; RV64I-NEXT: add a1, a1, a3
; RV64I-NEXT: add a0, a2, a0
@@ -824,7 +824,7 @@ define signext i32 @ctz5(i32 signext %x) nounwind {
;
; RV64I-LABEL: ctz5:
; RV64I: # %bb.0: # %entry
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -832,16 +832,16 @@ define signext i32 @ctz5(i32 signext %x) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
@@ -907,7 +907,7 @@ define signext i32 @ctz6(i32 signext %x) nounwind {
;
; RV64I-LABEL: ctz6:
; RV64I: # %bb.0: # %entry
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -915,16 +915,16 @@ define signext i32 @ctz6(i32 signext %x) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
@@ -997,7 +997,7 @@ define signext i32 @globalVar() nounwind {
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: lui a0, %hi(global_x)
; RV64I-NEXT: lw a0, %lo(global_x)(a0)
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -1005,16 +1005,16 @@ define signext i32 @globalVar() nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index ea8b04d..53c3f58 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -54,7 +54,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
; RV64IM-NEXT: slli a2, a2, 32
; RV64IM-NEXT: mulhu a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 32
-; RV64IM-NEXT: subw a0, a0, a1
+; RV64IM-NEXT: sub a0, a0, a1
; RV64IM-NEXT: srliw a0, a0, 1
; RV64IM-NEXT: add a0, a0, a1
; RV64IM-NEXT: srli a0, a0, 2
@@ -67,7 +67,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
; RV64IMZB-NEXT: addi a2, a2, -1755
; RV64IMZB-NEXT: mul a1, a1, a2
; RV64IMZB-NEXT: srli a1, a1, 32
-; RV64IMZB-NEXT: subw a0, a0, a1
+; RV64IMZB-NEXT: sub a0, a0, a1
; RV64IMZB-NEXT: srliw a0, a0, 1
; RV64IMZB-NEXT: add a0, a0, a1
; RV64IMZB-NEXT: srli a0, a0, 2
@@ -193,7 +193,7 @@ define i8 @udiv8_constant_add(i8 %a) nounwind {
; RV64IM-NEXT: li a2, 37
; RV64IM-NEXT: mul a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 8
-; RV64IM-NEXT: subw a0, a0, a1
+; RV64IM-NEXT: sub a0, a0, a1
; RV64IM-NEXT: slli a0, a0, 56
; RV64IM-NEXT: srli a0, a0, 57
; RV64IM-NEXT: add a0, a0, a1
@@ -206,7 +206,7 @@ define i8 @udiv8_constant_add(i8 %a) nounwind {
; RV64IMZB-NEXT: sh3add a2, a1, a1
; RV64IMZB-NEXT: sh2add a1, a2, a1
; RV64IMZB-NEXT: srli a1, a1, 8
-; RV64IMZB-NEXT: subw a0, a0, a1
+; RV64IMZB-NEXT: sub a0, a0, a1
; RV64IMZB-NEXT: slli a0, a0, 56
; RV64IMZB-NEXT: srli a0, a0, 57
; RV64IMZB-NEXT: add a0, a0, a1
@@ -257,7 +257,7 @@ define i16 @udiv16_constant_add(i16 %a) nounwind {
; RV64-NEXT: lui a2, 149808
; RV64-NEXT: mulhu a1, a1, a2
; RV64-NEXT: srli a1, a1, 16
-; RV64-NEXT: subw a0, a0, a1
+; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: slli a0, a0, 48
; RV64-NEXT: srli a0, a0, 49
; RV64-NEXT: add a0, a0, a1
@@ -367,7 +367,7 @@ define i32 @sdiv_constant_sub_srai(i32 %a) nounwind {
; RV64-NEXT: addi a2, a2, -1171
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: srli a1, a1, 32
-; RV64-NEXT: subw a1, a1, a0
+; RV64-NEXT: sub a1, a1, a0
; RV64-NEXT: srliw a0, a1, 31
; RV64-NEXT: sraiw a1, a1, 2
; RV64-NEXT: add a0, a1, a0
@@ -666,7 +666,7 @@ define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind {
; RV64IM-NEXT: srai a1, a1, 56
; RV64IM-NEXT: mul a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 8
-; RV64IM-NEXT: subw a1, a1, a0
+; RV64IM-NEXT: sub a1, a1, a0
; RV64IM-NEXT: slli a1, a1, 56
; RV64IM-NEXT: srli a0, a1, 63
; RV64IM-NEXT: srai a1, a1, 58
@@ -679,7 +679,7 @@ define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind {
; RV64IMZB-NEXT: li a2, 109
; RV64IMZB-NEXT: mul a1, a1, a2
; RV64IMZB-NEXT: srli a1, a1, 8
-; RV64IMZB-NEXT: subw a1, a1, a0
+; RV64IMZB-NEXT: sub a1, a1, a0
; RV64IMZB-NEXT: slli a1, a1, 56
; RV64IMZB-NEXT: srli a0, a1, 63
; RV64IMZB-NEXT: srai a1, a1, 58
@@ -889,7 +889,7 @@ define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind {
; RV64IM-NEXT: addi a2, a2, 1911
; RV64IM-NEXT: mul a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 16
-; RV64IM-NEXT: subw a1, a1, a0
+; RV64IM-NEXT: sub a1, a1, a0
; RV64IM-NEXT: slli a1, a1, 48
; RV64IM-NEXT: srli a0, a1, 63
; RV64IM-NEXT: srai a1, a1, 51
@@ -903,7 +903,7 @@ define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind {
; RV64IMZB-NEXT: addi a2, a2, 1911
; RV64IMZB-NEXT: mul a1, a1, a2
; RV64IMZB-NEXT: srli a1, a1, 16
-; RV64IMZB-NEXT: subw a1, a1, a0
+; RV64IMZB-NEXT: sub a1, a1, a0
; RV64IMZB-NEXT: slli a1, a1, 48
; RV64IMZB-NEXT: srli a0, a1, 63
; RV64IMZB-NEXT: srai a1, a1, 51
diff --git a/llvm/test/CodeGen/RISCV/double-convert-strict.ll b/llvm/test/CodeGen/RISCV/double-convert-strict.ll
index 2b1ec10..9a5e357 100644
--- a/llvm/test/CodeGen/RISCV/double-convert-strict.ll
+++ b/llvm/test/CodeGen/RISCV/double-convert-strict.ll
@@ -347,17 +347,11 @@ define double @fcvt_d_wu(i32 %a) nounwind strictfp {
declare double @llvm.experimental.constrained.uitofp.f64.i32(i32, metadata, metadata)
define double @fcvt_d_wu_load(ptr %p) nounwind strictfp {
-; RV32IFD-LABEL: fcvt_d_wu_load:
-; RV32IFD: # %bb.0:
-; RV32IFD-NEXT: lw a0, 0(a0)
-; RV32IFD-NEXT: fcvt.d.wu fa0, a0
-; RV32IFD-NEXT: ret
-;
-; RV64IFD-LABEL: fcvt_d_wu_load:
-; RV64IFD: # %bb.0:
-; RV64IFD-NEXT: lwu a0, 0(a0)
-; RV64IFD-NEXT: fcvt.d.wu fa0, a0
-; RV64IFD-NEXT: ret
+; CHECKIFD-LABEL: fcvt_d_wu_load:
+; CHECKIFD: # %bb.0:
+; CHECKIFD-NEXT: lw a0, 0(a0)
+; CHECKIFD-NEXT: fcvt.d.wu fa0, a0
+; CHECKIFD-NEXT: ret
;
; RV32IZFINXZDINX-LABEL: fcvt_d_wu_load:
; RV32IZFINXZDINX: # %bb.0:
@@ -367,7 +361,7 @@ define double @fcvt_d_wu_load(ptr %p) nounwind strictfp {
;
; RV64IZFINXZDINX-LABEL: fcvt_d_wu_load:
; RV64IZFINXZDINX: # %bb.0:
-; RV64IZFINXZDINX-NEXT: lwu a0, 0(a0)
+; RV64IZFINXZDINX-NEXT: lw a0, 0(a0)
; RV64IZFINXZDINX-NEXT: fcvt.d.wu a0, a0
; RV64IZFINXZDINX-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll
index fad9e21..a2e6186 100644
--- a/llvm/test/CodeGen/RISCV/double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/double-convert.ll
@@ -582,17 +582,11 @@ define double @fcvt_d_wu(i32 %a) nounwind {
}
define double @fcvt_d_wu_load(ptr %p) nounwind {
-; RV32IFD-LABEL: fcvt_d_wu_load:
-; RV32IFD: # %bb.0:
-; RV32IFD-NEXT: lw a0, 0(a0)
-; RV32IFD-NEXT: fcvt.d.wu fa0, a0
-; RV32IFD-NEXT: ret
-;
-; RV64IFD-LABEL: fcvt_d_wu_load:
-; RV64IFD: # %bb.0:
-; RV64IFD-NEXT: lwu a0, 0(a0)
-; RV64IFD-NEXT: fcvt.d.wu fa0, a0
-; RV64IFD-NEXT: ret
+; CHECKIFD-LABEL: fcvt_d_wu_load:
+; CHECKIFD: # %bb.0:
+; CHECKIFD-NEXT: lw a0, 0(a0)
+; CHECKIFD-NEXT: fcvt.d.wu fa0, a0
+; CHECKIFD-NEXT: ret
;
; RV32IZFINXZDINX-LABEL: fcvt_d_wu_load:
; RV32IZFINXZDINX: # %bb.0:
@@ -602,7 +596,7 @@ define double @fcvt_d_wu_load(ptr %p) nounwind {
;
; RV64IZFINXZDINX-LABEL: fcvt_d_wu_load:
; RV64IZFINXZDINX: # %bb.0:
-; RV64IZFINXZDINX-NEXT: lwu a0, 0(a0)
+; RV64IZFINXZDINX-NEXT: lw a0, 0(a0)
; RV64IZFINXZDINX-NEXT: fcvt.d.wu a0, a0
; RV64IZFINXZDINX-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/float-convert-strict.ll b/llvm/test/CodeGen/RISCV/float-convert-strict.ll
index 0c265e1..1b25a2b 100644
--- a/llvm/test/CodeGen/RISCV/float-convert-strict.ll
+++ b/llvm/test/CodeGen/RISCV/float-convert-strict.ll
@@ -236,29 +236,17 @@ define float @fcvt_s_wu(i32 %a) nounwind strictfp {
declare float @llvm.experimental.constrained.uitofp.f32.i32(i32 %a, metadata, metadata)
define float @fcvt_s_wu_load(ptr %p) nounwind strictfp {
-; RV32IF-LABEL: fcvt_s_wu_load:
-; RV32IF: # %bb.0:
-; RV32IF-NEXT: lw a0, 0(a0)
-; RV32IF-NEXT: fcvt.s.wu fa0, a0
-; RV32IF-NEXT: ret
-;
-; RV64IF-LABEL: fcvt_s_wu_load:
-; RV64IF: # %bb.0:
-; RV64IF-NEXT: lwu a0, 0(a0)
-; RV64IF-NEXT: fcvt.s.wu fa0, a0
-; RV64IF-NEXT: ret
-;
-; RV32IZFINX-LABEL: fcvt_s_wu_load:
-; RV32IZFINX: # %bb.0:
-; RV32IZFINX-NEXT: lw a0, 0(a0)
-; RV32IZFINX-NEXT: fcvt.s.wu a0, a0
-; RV32IZFINX-NEXT: ret
+; CHECKIF-LABEL: fcvt_s_wu_load:
+; CHECKIF: # %bb.0:
+; CHECKIF-NEXT: lw a0, 0(a0)
+; CHECKIF-NEXT: fcvt.s.wu fa0, a0
+; CHECKIF-NEXT: ret
;
-; RV64IZFINX-LABEL: fcvt_s_wu_load:
-; RV64IZFINX: # %bb.0:
-; RV64IZFINX-NEXT: lwu a0, 0(a0)
-; RV64IZFINX-NEXT: fcvt.s.wu a0, a0
-; RV64IZFINX-NEXT: ret
+; CHECKIZFINX-LABEL: fcvt_s_wu_load:
+; CHECKIZFINX: # %bb.0:
+; CHECKIZFINX-NEXT: lw a0, 0(a0)
+; CHECKIZFINX-NEXT: fcvt.s.wu a0, a0
+; CHECKIZFINX-NEXT: ret
;
; RV32I-LABEL: fcvt_s_wu_load:
; RV32I: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll
index 1cb7b27..60349a0 100644
--- a/llvm/test/CodeGen/RISCV/float-convert.ll
+++ b/llvm/test/CodeGen/RISCV/float-convert.ll
@@ -482,29 +482,17 @@ define float @fcvt_s_wu(i32 %a) nounwind {
}
define float @fcvt_s_wu_load(ptr %p) nounwind {
-; RV32IF-LABEL: fcvt_s_wu_load:
-; RV32IF: # %bb.0:
-; RV32IF-NEXT: lw a0, 0(a0)
-; RV32IF-NEXT: fcvt.s.wu fa0, a0
-; RV32IF-NEXT: ret
-;
-; RV64IF-LABEL: fcvt_s_wu_load:
-; RV64IF: # %bb.0:
-; RV64IF-NEXT: lwu a0, 0(a0)
-; RV64IF-NEXT: fcvt.s.wu fa0, a0
-; RV64IF-NEXT: ret
-;
-; RV32IZFINX-LABEL: fcvt_s_wu_load:
-; RV32IZFINX: # %bb.0:
-; RV32IZFINX-NEXT: lw a0, 0(a0)
-; RV32IZFINX-NEXT: fcvt.s.wu a0, a0
-; RV32IZFINX-NEXT: ret
+; CHECKIF-LABEL: fcvt_s_wu_load:
+; CHECKIF: # %bb.0:
+; CHECKIF-NEXT: lw a0, 0(a0)
+; CHECKIF-NEXT: fcvt.s.wu fa0, a0
+; CHECKIF-NEXT: ret
;
-; RV64IZFINX-LABEL: fcvt_s_wu_load:
-; RV64IZFINX: # %bb.0:
-; RV64IZFINX-NEXT: lwu a0, 0(a0)
-; RV64IZFINX-NEXT: fcvt.s.wu a0, a0
-; RV64IZFINX-NEXT: ret
+; CHECKIZFINX-LABEL: fcvt_s_wu_load:
+; CHECKIZFINX: # %bb.0:
+; CHECKIZFINX-NEXT: lw a0, 0(a0)
+; CHECKIZFINX-NEXT: fcvt.s.wu a0, a0
+; CHECKIZFINX-NEXT: ret
;
; RV32I-LABEL: fcvt_s_wu_load:
; RV32I: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/half-convert-strict.ll b/llvm/test/CodeGen/RISCV/half-convert-strict.ll
index 0a04d44..675e230 100644
--- a/llvm/test/CodeGen/RISCV/half-convert-strict.ll
+++ b/llvm/test/CodeGen/RISCV/half-convert-strict.ll
@@ -1461,29 +1461,17 @@ define half @fcvt_h_wu(i32 %a) nounwind strictfp {
declare half @llvm.experimental.constrained.uitofp.f16.i32(i32, metadata, metadata)
define half @fcvt_h_wu_load(ptr %p) nounwind strictfp {
-; RV32IZFH-LABEL: fcvt_h_wu_load:
-; RV32IZFH: # %bb.0:
-; RV32IZFH-NEXT: lw a0, 0(a0)
-; RV32IZFH-NEXT: fcvt.h.wu fa0, a0
-; RV32IZFH-NEXT: ret
-;
-; RV64IZFH-LABEL: fcvt_h_wu_load:
-; RV64IZFH: # %bb.0:
-; RV64IZFH-NEXT: lwu a0, 0(a0)
-; RV64IZFH-NEXT: fcvt.h.wu fa0, a0
-; RV64IZFH-NEXT: ret
-;
-; RV32IZHINX-LABEL: fcvt_h_wu_load:
-; RV32IZHINX: # %bb.0:
-; RV32IZHINX-NEXT: lw a0, 0(a0)
-; RV32IZHINX-NEXT: fcvt.h.wu a0, a0
-; RV32IZHINX-NEXT: ret
+; CHECKIZFH-LABEL: fcvt_h_wu_load:
+; CHECKIZFH: # %bb.0:
+; CHECKIZFH-NEXT: lw a0, 0(a0)
+; CHECKIZFH-NEXT: fcvt.h.wu fa0, a0
+; CHECKIZFH-NEXT: ret
;
-; RV64IZHINX-LABEL: fcvt_h_wu_load:
-; RV64IZHINX: # %bb.0:
-; RV64IZHINX-NEXT: lwu a0, 0(a0)
-; RV64IZHINX-NEXT: fcvt.h.wu a0, a0
-; RV64IZHINX-NEXT: ret
+; CHECKIZHINX-LABEL: fcvt_h_wu_load:
+; CHECKIZHINX: # %bb.0:
+; CHECKIZHINX-NEXT: lw a0, 0(a0)
+; CHECKIZHINX-NEXT: fcvt.h.wu a0, a0
+; CHECKIZHINX-NEXT: ret
;
; RV32IDZFH-LABEL: fcvt_h_wu_load:
; RV32IDZFH: # %bb.0:
@@ -1493,7 +1481,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind strictfp {
;
; RV64IDZFH-LABEL: fcvt_h_wu_load:
; RV64IDZFH: # %bb.0:
-; RV64IDZFH-NEXT: lwu a0, 0(a0)
+; RV64IDZFH-NEXT: lw a0, 0(a0)
; RV64IDZFH-NEXT: fcvt.h.wu fa0, a0
; RV64IDZFH-NEXT: ret
;
@@ -1505,7 +1493,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind strictfp {
;
; RV64IZDINXZHINX-LABEL: fcvt_h_wu_load:
; RV64IZDINXZHINX: # %bb.0:
-; RV64IZDINXZHINX-NEXT: lwu a0, 0(a0)
+; RV64IZDINXZHINX-NEXT: lw a0, 0(a0)
; RV64IZDINXZHINX-NEXT: fcvt.h.wu a0, a0
; RV64IZDINXZHINX-NEXT: ret
;
@@ -1518,7 +1506,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind strictfp {
;
; CHECK64-IZFHMIN-LABEL: fcvt_h_wu_load:
; CHECK64-IZFHMIN: # %bb.0:
-; CHECK64-IZFHMIN-NEXT: lwu a0, 0(a0)
+; CHECK64-IZFHMIN-NEXT: lw a0, 0(a0)
; CHECK64-IZFHMIN-NEXT: fcvt.s.wu fa5, a0
; CHECK64-IZFHMIN-NEXT: fcvt.h.s fa0, fa5
; CHECK64-IZFHMIN-NEXT: ret
@@ -1532,7 +1520,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind strictfp {
;
; CHECK64-IZHINXMIN-LABEL: fcvt_h_wu_load:
; CHECK64-IZHINXMIN: # %bb.0:
-; CHECK64-IZHINXMIN-NEXT: lwu a0, 0(a0)
+; CHECK64-IZHINXMIN-NEXT: lw a0, 0(a0)
; CHECK64-IZHINXMIN-NEXT: fcvt.s.wu a0, a0
; CHECK64-IZHINXMIN-NEXT: fcvt.h.s a0, a0
; CHECK64-IZHINXMIN-NEXT: ret
@@ -1546,7 +1534,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind strictfp {
;
; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_h_wu_load:
; CHECK64-IZDINXZHINXMIN: # %bb.0:
-; CHECK64-IZDINXZHINXMIN-NEXT: lwu a0, 0(a0)
+; CHECK64-IZDINXZHINXMIN-NEXT: lw a0, 0(a0)
; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.wu a0, a0
; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.h.s a0, a0
; CHECK64-IZDINXZHINXMIN-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll
index c53237e..facb544 100644
--- a/llvm/test/CodeGen/RISCV/half-convert.ll
+++ b/llvm/test/CodeGen/RISCV/half-convert.ll
@@ -4388,17 +4388,11 @@ define half @fcvt_h_wu(i32 %a) nounwind {
}
define half @fcvt_h_wu_load(ptr %p) nounwind {
-; RV32IZFH-LABEL: fcvt_h_wu_load:
-; RV32IZFH: # %bb.0:
-; RV32IZFH-NEXT: lw a0, 0(a0)
-; RV32IZFH-NEXT: fcvt.h.wu fa0, a0
-; RV32IZFH-NEXT: ret
-;
-; RV64IZFH-LABEL: fcvt_h_wu_load:
-; RV64IZFH: # %bb.0:
-; RV64IZFH-NEXT: lwu a0, 0(a0)
-; RV64IZFH-NEXT: fcvt.h.wu fa0, a0
-; RV64IZFH-NEXT: ret
+; CHECKIZFH-LABEL: fcvt_h_wu_load:
+; CHECKIZFH: # %bb.0:
+; CHECKIZFH-NEXT: lw a0, 0(a0)
+; CHECKIZFH-NEXT: fcvt.h.wu fa0, a0
+; CHECKIZFH-NEXT: ret
;
; RV32IDZFH-LABEL: fcvt_h_wu_load:
; RV32IDZFH: # %bb.0:
@@ -4408,33 +4402,21 @@ define half @fcvt_h_wu_load(ptr %p) nounwind {
;
; RV64IDZFH-LABEL: fcvt_h_wu_load:
; RV64IDZFH: # %bb.0:
-; RV64IDZFH-NEXT: lwu a0, 0(a0)
+; RV64IDZFH-NEXT: lw a0, 0(a0)
; RV64IDZFH-NEXT: fcvt.h.wu fa0, a0
; RV64IDZFH-NEXT: ret
;
-; RV32IZHINX-LABEL: fcvt_h_wu_load:
-; RV32IZHINX: # %bb.0:
-; RV32IZHINX-NEXT: lw a0, 0(a0)
-; RV32IZHINX-NEXT: fcvt.h.wu a0, a0
-; RV32IZHINX-NEXT: ret
-;
-; RV64IZHINX-LABEL: fcvt_h_wu_load:
-; RV64IZHINX: # %bb.0:
-; RV64IZHINX-NEXT: lwu a0, 0(a0)
-; RV64IZHINX-NEXT: fcvt.h.wu a0, a0
-; RV64IZHINX-NEXT: ret
-;
-; RV32IZDINXZHINX-LABEL: fcvt_h_wu_load:
-; RV32IZDINXZHINX: # %bb.0:
-; RV32IZDINXZHINX-NEXT: lw a0, 0(a0)
-; RV32IZDINXZHINX-NEXT: fcvt.h.wu a0, a0
-; RV32IZDINXZHINX-NEXT: ret
+; CHECKIZHINX-LABEL: fcvt_h_wu_load:
+; CHECKIZHINX: # %bb.0:
+; CHECKIZHINX-NEXT: lw a0, 0(a0)
+; CHECKIZHINX-NEXT: fcvt.h.wu a0, a0
+; CHECKIZHINX-NEXT: ret
;
-; RV64IZDINXZHINX-LABEL: fcvt_h_wu_load:
-; RV64IZDINXZHINX: # %bb.0:
-; RV64IZDINXZHINX-NEXT: lwu a0, 0(a0)
-; RV64IZDINXZHINX-NEXT: fcvt.h.wu a0, a0
-; RV64IZDINXZHINX-NEXT: ret
+; CHECKIZDINXZHINX-LABEL: fcvt_h_wu_load:
+; CHECKIZDINXZHINX: # %bb.0:
+; CHECKIZDINXZHINX-NEXT: lw a0, 0(a0)
+; CHECKIZDINXZHINX-NEXT: fcvt.h.wu a0, a0
+; CHECKIZDINXZHINX-NEXT: ret
;
; RV32I-LABEL: fcvt_h_wu_load:
; RV32I: # %bb.0:
@@ -4476,7 +4458,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind {
; RV64ID-LP64: # %bb.0:
; RV64ID-LP64-NEXT: addi sp, sp, -16
; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64ID-LP64-NEXT: lwu a0, 0(a0)
+; RV64ID-LP64-NEXT: lw a0, 0(a0)
; RV64ID-LP64-NEXT: fcvt.s.wu fa5, a0
; RV64ID-LP64-NEXT: fmv.x.w a0, fa5
; RV64ID-LP64-NEXT: call __truncsfhf2
@@ -4505,7 +4487,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind {
; RV64ID: # %bb.0:
; RV64ID-NEXT: addi sp, sp, -16
; RV64ID-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64ID-NEXT: lwu a0, 0(a0)
+; RV64ID-NEXT: lw a0, 0(a0)
; RV64ID-NEXT: fcvt.s.wu fa0, a0
; RV64ID-NEXT: call __truncsfhf2
; RV64ID-NEXT: fmv.x.w a0, fa0
@@ -4525,7 +4507,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind {
;
; CHECK64-IZFHMIN-LABEL: fcvt_h_wu_load:
; CHECK64-IZFHMIN: # %bb.0:
-; CHECK64-IZFHMIN-NEXT: lwu a0, 0(a0)
+; CHECK64-IZFHMIN-NEXT: lw a0, 0(a0)
; CHECK64-IZFHMIN-NEXT: fcvt.s.wu fa5, a0
; CHECK64-IZFHMIN-NEXT: fcvt.h.s fa0, fa5
; CHECK64-IZFHMIN-NEXT: ret
@@ -4539,7 +4521,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind {
;
; CHECK64-IZHINXMIN-LABEL: fcvt_h_wu_load:
; CHECK64-IZHINXMIN: # %bb.0:
-; CHECK64-IZHINXMIN-NEXT: lwu a0, 0(a0)
+; CHECK64-IZHINXMIN-NEXT: lw a0, 0(a0)
; CHECK64-IZHINXMIN-NEXT: fcvt.s.wu a0, a0
; CHECK64-IZHINXMIN-NEXT: fcvt.h.s a0, a0
; CHECK64-IZHINXMIN-NEXT: ret
@@ -4553,7 +4535,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind {
;
; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_h_wu_load:
; CHECK64-IZDINXZHINXMIN: # %bb.0:
-; CHECK64-IZDINXZHINXMIN-NEXT: lwu a0, 0(a0)
+; CHECK64-IZDINXZHINXMIN-NEXT: lw a0, 0(a0)
; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.wu a0, a0
; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.h.s a0, a0
; CHECK64-IZDINXZHINXMIN-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index 66cde32..774f1a1 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -651,7 +651,7 @@ define void @zext16_abs8(i8 %x, ptr %p) {
; RV64I-NEXT: srai a2, a0, 63
; RV64I-NEXT: srai a0, a0, 56
; RV64I-NEXT: xor a0, a0, a2
-; RV64I-NEXT: subw a0, a0, a2
+; RV64I-NEXT: sub a0, a0, a2
; RV64I-NEXT: sh a0, 0(a1)
; RV64I-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
index b1a6d16..87c8343 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
@@ -42,7 +42,7 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
; RV64-NEXT: vmerge.vvm v8, v9, v8, v0
; RV64-NEXT: vredmaxu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a1, v8
-; RV64-NEXT: subw a0, a0, a1
+; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: slli a0, a0, 48
; RV64-NEXT: srli a0, a0, 48
; RV64-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
index 20dd590..1216d30 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
@@ -35,7 +35,7 @@ define i16 @ctz_v4i32(<4 x i32> %a) {
; RV64-NEXT: vredmaxu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: li a1, 4
-; RV64-NEXT: subw a1, a1, a0
+; RV64-NEXT: sub a1, a1, a0
; RV64-NEXT: zext.b a0, a1
; RV64-NEXT: ret
%res = call i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32> %a, i1 0)
diff --git a/llvm/test/CodeGen/RISCV/machine-combiner.ll b/llvm/test/CodeGen/RISCV/machine-combiner.ll
index 1be599e4..7a1c41c 100644
--- a/llvm/test/CodeGen/RISCV/machine-combiner.ll
+++ b/llvm/test/CodeGen/RISCV/machine-combiner.ll
@@ -454,7 +454,7 @@ define i32 @test_reassoc_add_sub_i32_1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
; CHECK-LABEL: test_reassoc_add_sub_i32_1:
; CHECK: # %bb.0:
; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: subw a2, a2, a3
+; CHECK-NEXT: sub a2, a2, a3
; CHECK-NEXT: subw a0, a0, a2
; CHECK-NEXT: ret
%t0 = add i32 %a0, %a1
@@ -467,7 +467,7 @@ define i32 @test_reassoc_add_sub_i32_2(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
; CHECK-LABEL: test_reassoc_add_sub_i32_2:
; CHECK: # %bb.0:
; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: subw a2, a2, a3
+; CHECK-NEXT: sub a2, a2, a3
; CHECK-NEXT: addw a0, a0, a2
; CHECK-NEXT: ret
%t0 = add i32 %a0, %a1
diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
index 0d57e42..cd93579 100644
--- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
@@ -3780,9 +3780,9 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_5:
 ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 4(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1
@@ -3985,9 +3985,9 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_6:
 ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 4(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1
diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll
index 0caab1f..a5bdb13 100644
--- a/llvm/test/CodeGen/RISCV/memcmp.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp.ll
@@ -4410,9 +4410,9 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_5:
 ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 4(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1
@@ -4615,9 +4615,9 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_6:
 ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 4(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1
diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index 27d5eaa..4c9a98c 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -1080,14 +1080,14 @@ define i32 @muli32_m65(i32 %a) nounwind {
 ; RV64I-LABEL: muli32_m65:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: slli a1, a0, 6
-; RV64I-NEXT: negw a0, a0
+; RV64I-NEXT: neg a0, a0
 ; RV64I-NEXT: subw a0, a0, a1
 ; RV64I-NEXT: ret
 ;
 ; RV64IM-LABEL: muli32_m65:
 ; RV64IM: # %bb.0:
 ; RV64IM-NEXT: slli a1, a0, 6
-; RV64IM-NEXT: negw a0, a0
+; RV64IM-NEXT: neg a0, a0
 ; RV64IM-NEXT: subw a0, a0, a1
 ; RV64IM-NEXT: ret
 %1 = mul i32 %a, -65
@@ -1980,14 +1980,14 @@ define i8 @muladd_demand(i8 %x, i8 %y) nounwind {
 ; RV64I-LABEL: muladd_demand:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: slli a0, a0, 1
-; RV64I-NEXT: subw a0, a1, a0
+; RV64I-NEXT: sub a0, a1, a0
 ; RV64I-NEXT: andi a0, a0, 15
 ; RV64I-NEXT: ret
 ;
 ; RV64IM-LABEL: muladd_demand:
 ; RV64IM: # %bb.0:
 ; RV64IM-NEXT: slli a0, a0, 1
-; RV64IM-NEXT: subw a0, a1, a0
+; RV64IM-NEXT: sub a0, a1, a0
 ; RV64IM-NEXT: andi a0, a0, 15
 ; RV64IM-NEXT: ret
 %m = mul i8 %x, 14
@@ -2048,14 +2048,14 @@ define i8 @muladd_demand_2(i8 %x, i8 %y) nounwind {
 ; RV64I-LABEL: muladd_demand_2:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: slli a0, a0, 1
-; RV64I-NEXT: subw a1, a1, a0
+; RV64I-NEXT: sub a1, a1, a0
 ; RV64I-NEXT: ori a0, a1, -16
 ; RV64I-NEXT: ret
 ;
 ; RV64IM-LABEL: muladd_demand_2:
 ; RV64IM: # %bb.0:
 ; RV64IM-NEXT: slli a0, a0, 1
-; RV64IM-NEXT: subw a1, a1, a0
+; RV64IM-NEXT: sub a1, a1, a0
 ; RV64IM-NEXT: ori a0, a1, -16
 ; RV64IM-NEXT: ret
 %m = mul i8 %x, 14
diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll
index fe19a4fa..da81fe5 100644
--- a/llvm/test/CodeGen/RISCV/neg-abs.ll
+++ b/llvm/test/CodeGen/RISCV/neg-abs.ll
@@ -179,7 +179,7 @@ define i32 @neg_abs32_multiuse(i32 %x, ptr %y) {
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: sraiw a2, a0, 31
 ; RV64I-NEXT: xor a0, a0, a2
-; RV64I-NEXT: subw a2, a0, a2
+; RV64I-NEXT: sub a2, a0, a2
 ; RV64I-NEXT: negw a0, a2
 ; RV64I-NEXT: sw a2, 0(a1)
 ; RV64I-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
index 47b90a0..ba6769b 100644
--- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
@@ -833,7 +833,7 @@ define i1 @usubo_ugt_i32(i32 %x, i32 %y, ptr %p) {
 ; RV64-NEXT: sext.w a3, a1
 ; RV64-NEXT: sext.w a4, a0
 ; RV64-NEXT: sltu a3, a4, a3
-; RV64-NEXT: subw a0, a0, a1
+; RV64-NEXT: sub a0, a0, a1
 ; RV64-NEXT: sw a0, 0(a2)
 ; RV64-NEXT: mv a0, a3
 ; RV64-NEXT: ret
@@ -860,7 +860,7 @@ define i1 @usubo_ugt_constant_op0_i8(i8 %x, ptr %p) {
 ; RV64: # %bb.0:
 ; RV64-NEXT: zext.b a2, a0
 ; RV64-NEXT: li a3, 42
-; RV64-NEXT: subw a3, a3, a0
+; RV64-NEXT: sub a3, a3, a0
 ; RV64-NEXT: sltiu a0, a2, 43
 ; RV64-NEXT: xori a0, a0, 1
 ; RV64-NEXT: sb a3, 0(a1)
@@ -890,7 +890,7 @@ define i1 @usubo_ult_constant_op0_i16(i16 %x, ptr %p) {
 ; RV64-NEXT: slli a2, a0, 48
 ; RV64-NEXT: li a3, 43
 ; RV64-NEXT: srli a2, a2, 48
-; RV64-NEXT: subw a3, a3, a0
+; RV64-NEXT: sub a3, a3, a0
 ; RV64-NEXT: sltiu a0, a2, 44
 ; RV64-NEXT: xori a0, a0, 1
 ; RV64-NEXT: sh a3, 0(a1)
@@ -987,7 +987,7 @@ define i1 @usubo_ne_constant0_op1_i32(i32 %x, ptr %p) {
 ; RV64-LABEL: usubo_ne_constant0_op1_i32:
 ; RV64: # %bb.0:
 ; RV64-NEXT: sext.w a2, a0
-; RV64-NEXT: negw a3, a0
+; RV64-NEXT: neg a3, a0
 ; RV64-NEXT: snez a0, a2
 ; RV64-NEXT: sw a3, 0(a1)
 ; RV64-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/pr145360.ll b/llvm/test/CodeGen/RISCV/pr145360.ll
index 4251ac6..1c77fad 100644
--- a/llvm/test/CodeGen/RISCV/pr145360.ll
+++ b/llvm/test/CodeGen/RISCV/pr145360.ll
@@ -8,7 +8,7 @@ define i32 @signed(i32 %0, ptr %1) {
 ; CHECK-NEXT: srliw a2, a2, 24
 ; CHECK-NEXT: add a2, a0, a2
 ; CHECK-NEXT: andi a2, a2, -256
-; CHECK-NEXT: subw a2, a0, a2
+; CHECK-NEXT: sub a2, a0, a2
 ; CHECK-NEXT: sraiw a0, a0, 8
 ; CHECK-NEXT: sw a2, 0(a1)
 ; CHECK-NEXT: ret
@@ -29,7 +29,7 @@ define i32 @unsigned(i32 %0, ptr %1) {
 ; CHECK-NEXT: srli a2, a2, 36
 ; CHECK-NEXT: slli a4, a2, 5
 ; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: subw a2, a2, a4
+; CHECK-NEXT: sub a2, a2, a4
 ; CHECK-NEXT: srliw a4, a0, 3
 ; CHECK-NEXT: add a2, a0, a2
 ; CHECK-NEXT: mulw a0, a4, a3
@@ -49,7 +49,7 @@ define i32 @signed_div_first(i32 %0, ptr %1) {
 ; CHECK-NEXT: add a3, a0, a2
 ; CHECK-NEXT: sraiw a2, a3, 8
 ; CHECK-NEXT: andi a3, a3, -256
-; CHECK-NEXT: subw a0, a0, a3
+; CHECK-NEXT: sub a0, a0, a3
 ; CHECK-NEXT: sw a0, 0(a1)
 ; CHECK-NEXT: mv a0, a2
 ; CHECK-NEXT: ret
@@ -70,7 +70,7 @@ define i32 @unsigned_div_first(i32 %0, ptr %1) {
 ; CHECK-NEXT: srli a2, a2, 36
 ; CHECK-NEXT: slli a3, a2, 5
 ; CHECK-NEXT: slli a4, a2, 3
-; CHECK-NEXT: subw a4, a4, a3
+; CHECK-NEXT: sub a4, a4, a3
 ; CHECK-NEXT: add a0, a0, a4
 ; CHECK-NEXT: sw a0, 0(a1)
 ; CHECK-NEXT: mv a0, a2
diff --git a/llvm/test/CodeGen/RISCV/prefer-w-inst.mir b/llvm/test/CodeGen/RISCV/prefer-w-inst.mir
index e05e27a..b8ff783 100644
--- a/llvm/test/CodeGen/RISCV/prefer-w-inst.mir
+++ b/llvm/test/CodeGen/RISCV/prefer-w-inst.mir
@@ -239,8 +239,8 @@ body: |
 ; NO-PREFER-W-INST-NEXT: {{ $}}
 ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
 ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
-; NO-PREFER-W-INST-NEXT: [[LWU:%[0-9]+]]:gpr = LWU [[COPY]], 0
-; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LWU]], 1
+; NO-PREFER-W-INST-NEXT: [[LW:%[0-9]+]]:gpr = LW [[COPY]], 0
+; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LW]], 1
 ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]]
 ; NO-PREFER-W-INST-NEXT: PseudoRET
 ;
diff --git a/llvm/test/CodeGen/RISCV/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/rotl-rotr.ll
index 634cca5..cf64650 100644
--- a/llvm/test/CodeGen/RISCV/rotl-rotr.ll
+++ b/llvm/test/CodeGen/RISCV/rotl-rotr.ll
@@ -29,7 +29,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind {
 ;
 ; RV64I-LABEL: rotl_32:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
 ; RV64I-NEXT: sllw a1, a0, a1
 ; RV64I-NEXT: srlw a0, a0, a2
 ; RV64I-NEXT: or a0, a1, a0
@@ -56,7 +56,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind {
 ; RV64XTHEADBB-LABEL: rotl_32:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: sllw a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
 ; RV64XTHEADBB-NEXT: srlw a0, a0, a1
 ; RV64XTHEADBB-NEXT: or a0, a2, a0
 ; RV64XTHEADBB-NEXT: ret
@@ -78,7 +78,7 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind {
 ;
 ; RV64I-LABEL: rotr_32:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
 ; RV64I-NEXT: srlw a1, a0, a1
 ; RV64I-NEXT: sllw a0, a0, a2
 ; RV64I-NEXT: or a0, a1, a0
@@ -105,7 +105,7 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind {
 ; RV64XTHEADBB-LABEL: rotr_32:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: srlw a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
 ; RV64XTHEADBB-NEXT: sllw a0, a0, a1
 ; RV64XTHEADBB-NEXT: or a0, a2, a0
 ; RV64XTHEADBB-NEXT: ret
@@ -159,7 +159,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
 ;
 ; RV64I-LABEL: rotl_64:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
 ; RV64I-NEXT: sll a1, a0, a1
 ; RV64I-NEXT: srl a0, a0, a2
 ; RV64I-NEXT: or a0, a1, a0
@@ -253,7 +253,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
 ; RV64XTHEADBB-LABEL: rotl_64:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: sll a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
 ; RV64XTHEADBB-NEXT: srl a0, a0, a1
 ; RV64XTHEADBB-NEXT: or a0, a2, a0
 ; RV64XTHEADBB-NEXT: ret
@@ -307,7 +307,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
 ;
 ; RV64I-LABEL: rotr_64:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
 ; RV64I-NEXT: srl a1, a0, a1
 ; RV64I-NEXT: sll a0, a0, a2
 ; RV64I-NEXT: or a0, a1, a0
@@ -401,7 +401,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
 ; RV64XTHEADBB-LABEL: rotr_64:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: srl a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
 ; RV64XTHEADBB-NEXT: sll a0, a0, a1
 ; RV64XTHEADBB-NEXT: or a0, a2, a0
 ; RV64XTHEADBB-NEXT: ret
@@ -423,7 +423,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind {
 ;
 ; RV64I-LABEL: rotl_32_mask:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
 ; RV64I-NEXT: sllw a1, a0, a1
 ; RV64I-NEXT: srlw a0, a0, a2
 ; RV64I-NEXT: or a0, a1, a0
@@ -450,7 +450,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind {
 ; RV64XTHEADBB-LABEL: rotl_32_mask:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: sllw a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
 ; RV64XTHEADBB-NEXT: srlw a0, a0, a1
 ; RV64XTHEADBB-NEXT: or a0, a2, a0
 ; RV64XTHEADBB-NEXT: ret
@@ -474,7 +474,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
 ; RV64I-LABEL: rotl_32_mask_and_63_and_31:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: sllw a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
 ; RV64I-NEXT: srlw a0, a0, a1
 ; RV64I-NEXT: or a0, a2, a0
 ; RV64I-NEXT: ret
@@ -500,7 +500,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
 ; RV64XTHEADBB-LABEL: rotl_32_mask_and_63_and_31:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: sllw a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
 ; RV64XTHEADBB-NEXT: srlw a0, a0, a1
 ; RV64XTHEADBB-NEXT: or a0, a2, a0
 ; RV64XTHEADBB-NEXT: ret
@@ -545,7 +545,7 @@ define i32 @rotl_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind {
 ; RV64XTHEADBB-LABEL: rotl_32_mask_or_64_or_32:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: sllw a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
 ; RV64XTHEADBB-NEXT: srlw a0, a0, a1
 ; RV64XTHEADBB-NEXT: or a0, a2, a0
 ; RV64XTHEADBB-NEXT: ret
@@ -569,7 +569,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind {
 ;
 ; RV64I-LABEL: rotr_32_mask:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
 ; RV64I-NEXT: srlw a1, a0, a1
 ; RV64I-NEXT: sllw a0, a0, a2
 ; RV64I-NEXT: or a0, a1, a0
@@ -596,7 +596,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind {
 ; RV64XTHEADBB-LABEL: rotr_32_mask:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: srlw a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
 ; RV64XTHEADBB-NEXT: sllw a0, a0, a1
 ; RV64XTHEADBB-NEXT: or a0, a2, a0
 ; RV64XTHEADBB-NEXT: ret
@@ -620,7 +620,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
 ; RV64I-LABEL: rotr_32_mask_and_63_and_31:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: srlw a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
 ; RV64I-NEXT: sllw a0, a0, a1
 ; RV64I-NEXT: or a0, a2, a0
 ; RV64I-NEXT: ret
@@ -646,7 +646,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
 ; RV64XTHEADBB-LABEL: rotr_32_mask_and_63_and_31:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: srlw a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
 ; RV64XTHEADBB-NEXT: sllw a0, a0, a1
 ; RV64XTHEADBB-NEXT: or a0, a2, a0
 ; RV64XTHEADBB-NEXT: ret
@@ -691,7 +691,7 @@ define i32 @rotr_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind {
 ; RV64XTHEADBB-LABEL: rotr_32_mask_or_64_or_32:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: srlw a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
 ; RV64XTHEADBB-NEXT: sllw a0, a0, a1
 ; RV64XTHEADBB-NEXT: or a0, a2, a0
 ; RV64XTHEADBB-NEXT: ret
@@ -745,7 +745,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
 ;
 ; RV64I-LABEL: rotl_64_mask:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
 ; RV64I-NEXT: sll a1, a0, a1
 ; RV64I-NEXT: srl a0, a0, a2
 ; RV64I-NEXT: or a0, a1, a0
@@ -835,7 +835,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
 ; RV64XTHEADBB-LABEL: rotl_64_mask:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: sll a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
 ; RV64XTHEADBB-NEXT: srl a0, a0, a1
 ; RV64XTHEADBB-NEXT: or a0, a2, a0
 ; RV64XTHEADBB-NEXT: ret
@@ -890,7 +890,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
 ; RV64I-LABEL: rotl_64_mask_and_127_and_63:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: sll a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
 ; RV64I-NEXT: srl a0, a0, a1
 ; RV64I-NEXT: or a0, a2, a0
 ; RV64I-NEXT: ret
@@ -981,7 +981,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
 ; RV64XTHEADBB-LABEL: rotl_64_mask_and_127_and_63:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: sll a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
 ; RV64XTHEADBB-NEXT: srl a0, a0, a1
 ; RV64XTHEADBB-NEXT: or a0, a2, a0
 ; RV64XTHEADBB-NEXT: ret
@@ -1026,7 +1026,7 @@ define i64 @rotl_64_mask_or_128_or_64(i64 %x, i64 %y) nounwind {
 ; RV64XTHEADBB-LABEL: rotl_64_mask_or_128_or_64:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: sll a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
 ; RV64XTHEADBB-NEXT: srl a0, a0, a1
 ; RV64XTHEADBB-NEXT: or a0, a2, a0
 ; RV64XTHEADBB-NEXT: ret
@@ -1080,7 +1080,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
 ;
 ; RV64I-LABEL: rotr_64_mask:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
 ; RV64I-NEXT: srl a1, a0, a1
 ; RV64I-NEXT: sll a0, a0, a2
 ; RV64I-NEXT: or a0, a1, a0
@@ -1170,7 +1170,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
 ; RV64XTHEADBB-LABEL: rotr_64_mask:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: srl a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
 ; RV64XTHEADBB-NEXT: sll a0, a0, a1
 ; RV64XTHEADBB-NEXT: or a0, a2, a0
 ; RV64XTHEADBB-NEXT: ret
@@ -1225,7 +1225,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
 ; RV64I-LABEL: rotr_64_mask_and_127_and_63:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: srl a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
 ; RV64I-NEXT: sll a0, a0, a1
 ; RV64I-NEXT: or a0, a2, a0
 ; RV64I-NEXT: ret
@@ -1316,7 +1316,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
 ; RV64XTHEADBB-LABEL: rotr_64_mask_and_127_and_63:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: srl a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
 ; RV64XTHEADBB-NEXT: sll a0, a0, a1
 ; RV64XTHEADBB-NEXT: or a0, a2, a0
 ; RV64XTHEADBB-NEXT: ret
@@ -1361,7 +1361,7 @@ define i64 @rotr_64_mask_or_128_or_64(i64 %x, i64 %y) nounwind {
 ; RV64XTHEADBB-LABEL: rotr_64_mask_or_128_or_64:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: srl a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
 ; RV64XTHEADBB-NEXT: sll a0, a0, a1
 ; RV64XTHEADBB-NEXT: or a0, a2, a0
 ; RV64XTHEADBB-NEXT: ret
@@ -1390,7 +1390,7 @@ define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
 ; RV64I-LABEL: rotl_32_mask_shared:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: sllw a3, a0, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
 ; RV64I-NEXT: srlw a0, a0, a4
 ; RV64I-NEXT: or a0, a3, a0
 ; RV64I-NEXT: sllw a1, a1, a2
@@ -1424,7 +1424,7 @@ define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
 ; RV64XTHEADBB-LABEL: rotl_32_mask_shared:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: sllw a3, a0, a2
-; RV64XTHEADBB-NEXT: negw a4, a2
+; RV64XTHEADBB-NEXT: neg a4, a2
 ; RV64XTHEADBB-NEXT: srlw a0, a0, a4
 ; RV64XTHEADBB-NEXT: or a0, a3, a0
 ; RV64XTHEADBB-NEXT: sllw a1, a1, a2
@@ -1486,7 +1486,7 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ; RV64I-LABEL: rotl_64_mask_shared:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: sll a3, a0, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
 ; RV64I-NEXT: srl a0, a0, a4
 ; RV64I-NEXT: or a0, a3, a0
 ; RV64I-NEXT: sll a1, a1, a2
@@ -1590,7 +1590,7 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ; RV64XTHEADBB-LABEL: rotl_64_mask_shared:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: sll a3, a0, a2
-; RV64XTHEADBB-NEXT: negw a4, a2
+; RV64XTHEADBB-NEXT: neg a4, a2
 ; RV64XTHEADBB-NEXT: srl a0, a0, a4
 ; RV64XTHEADBB-NEXT: or a0, a3, a0
 ; RV64XTHEADBB-NEXT: sll a1, a1, a2
@@ -1618,7 +1618,7 @@ define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
 ; RV64I-LABEL: rotr_32_mask_shared:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: srlw a3, a0, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
 ; RV64I-NEXT: sllw a0, a0, a4
 ; RV64I-NEXT: or a0, a3, a0
 ; RV64I-NEXT: sllw a1, a1, a2
@@ -1652,7 +1652,7 @@ define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
 ; RV64XTHEADBB-LABEL: rotr_32_mask_shared:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: srlw a3, a0, a2
-; RV64XTHEADBB-NEXT: negw a4, a2
+; RV64XTHEADBB-NEXT: neg a4, a2
 ; RV64XTHEADBB-NEXT: sllw a0, a0, a4
 ; RV64XTHEADBB-NEXT: or a0, a3, a0
 ; RV64XTHEADBB-NEXT: sllw a1, a1, a2
@@ -1713,7 +1713,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ; RV64I-LABEL: rotr_64_mask_shared:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: srl a3, a0, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
 ; RV64I-NEXT: sll a0, a0, a4
 ; RV64I-NEXT: or a0, a3, a0
 ; RV64I-NEXT: sll a1, a1, a2
@@ -1816,7 +1816,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ; RV64XTHEADBB-LABEL: rotr_64_mask_shared:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: srl a3, a0, a2
-; RV64XTHEADBB-NEXT: negw a4, a2
+; RV64XTHEADBB-NEXT: neg a4, a2
 ; RV64XTHEADBB-NEXT: sll a0, a0, a4
 ; RV64XTHEADBB-NEXT: or a0, a3, a0
 ; RV64XTHEADBB-NEXT: sll a1, a1, a2
@@ -1846,7 +1846,7 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV64I-LABEL: rotl_32_mask_multiple:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: sllw a3, a0, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
 ; RV64I-NEXT: sllw a2, a1, a2
 ; RV64I-NEXT: srlw a0, a0, a4
 ; RV64I-NEXT: srlw a1, a1, a4
@@ -1884,7 +1884,7 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV64XTHEADBB-LABEL: rotl_32_mask_multiple:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: sllw a3, a0, a2
-; RV64XTHEADBB-NEXT: negw a4, a2
+; RV64XTHEADBB-NEXT: neg a4, a2
 ; RV64XTHEADBB-NEXT: sllw a2, a1, a2
 ; RV64XTHEADBB-NEXT: srlw a0, a0, a4
 ; RV64XTHEADBB-NEXT: srlw a1, a1, a4
@@ -1948,7 +1948,7 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV64I-LABEL: rotl_64_mask_multiple:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: sll a3, a0, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
 ; RV64I-NEXT: sll a2, a1, a2
 ; RV64I-NEXT: srl a0, a0, a4
 ; RV64I-NEXT: srl a1, a1, a4
@@ -2056,7 +2056,7 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV64XTHEADBB-LABEL: rotl_64_mask_multiple:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: sll a3, a0, a2
-; RV64XTHEADBB-NEXT: negw a4, a2
+; RV64XTHEADBB-NEXT: neg a4, a2
 ; RV64XTHEADBB-NEXT: sll a2, a1, a2
 ; RV64XTHEADBB-NEXT: srl a0, a0, a4
 ; RV64XTHEADBB-NEXT: srl a1, a1, a4
@@ -2087,7 +2087,7 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV64I-LABEL: rotr_32_mask_multiple:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: srlw a3, a0, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
 ; RV64I-NEXT: srlw a2, a1, a2
 ; RV64I-NEXT: sllw a0, a0, a4
 ; RV64I-NEXT: sllw a1, a1, a4
@@ -2125,7 +2125,7 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV64XTHEADBB-LABEL: rotr_32_mask_multiple:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: srlw a3, a0, a2
-; RV64XTHEADBB-NEXT: negw a4, a2
+; RV64XTHEADBB-NEXT: neg a4, a2
 ; RV64XTHEADBB-NEXT: srlw a2, a1, a2
 ; RV64XTHEADBB-NEXT: sllw a0, a0, a4
 ; RV64XTHEADBB-NEXT: sllw a1, a1, a4
@@ -2188,7 +2188,7 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV64I-LABEL: rotr_64_mask_multiple:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: srl a3, a0, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
 ; RV64I-NEXT: srl a2, a1, a2
 ; RV64I-NEXT: sll a0, a0, a4
 ; RV64I-NEXT: sll a1, a1, a4
@@ -2295,7 +2295,7 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV64XTHEADBB-LABEL: rotr_64_mask_multiple:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: srl a3, a0, a2
-; RV64XTHEADBB-NEXT: negw a4, a2
+; RV64XTHEADBB-NEXT: neg a4, a2
 ; RV64XTHEADBB-NEXT: srl a2, a1, a2
 ; RV64XTHEADBB-NEXT: sll a0, a0, a4
 ; RV64XTHEADBB-NEXT: sll a1, a1, a4
@@ -2353,7 +2353,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
 ;
 ; RV64I-LABEL: rotl_64_zext:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
 ; RV64I-NEXT: sll a1, a0, a1
 ; RV64I-NEXT: srl a0, a0, a2
 ; RV64I-NEXT: or a0, a1, a0
@@ -2447,7 +2447,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
 ; RV64XTHEADBB-LABEL: rotl_64_zext:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: sll a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
 ; RV64XTHEADBB-NEXT: srl a0, a0, a1
 ; RV64XTHEADBB-NEXT: or a0, a2, a0
 ; RV64XTHEADBB-NEXT: ret
@@ -2503,7 +2503,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
 ;
 ; RV64I-LABEL: rotr_64_zext:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
 ; RV64I-NEXT: srl a1, a0, a1
 ; RV64I-NEXT: sll a0, a0, a2
 ; RV64I-NEXT: or a0, a1, a0
@@ -2597,7 +2597,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
 ; RV64XTHEADBB-LABEL: rotr_64_zext:
 ; RV64XTHEADBB: # %bb.0:
 ; RV64XTHEADBB-NEXT: srl a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
 ; RV64XTHEADBB-NEXT: sll a0, a0, a1
 ; RV64XTHEADBB-NEXT: or a0, a2, a0
 ; RV64XTHEADBB-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll b/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll
index b8c4328..721436d 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll
@@ -121,7 +121,7 @@ define signext i32 @andi_sub_cse(i32 signext %0, i32 signext %1, ptr %2) {
 define signext i32 @addi_sub_cse(i32 signext %0, i32 signext %1, ptr %2) {
 ; CHECK-LABEL: addi_sub_cse:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: subw a0, a0, a1
+; CHECK-NEXT: sub a0, a0, a1
 ; CHECK-NEXT: addiw a0, a0, -8
 ; CHECK-NEXT: sw a0, 0(a2)
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rv64i-exhaustive-w-insts.ll b/llvm/test/CodeGen/RISCV/rv64i-exhaustive-w-insts.ll
index dad20b2..6b4c253 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-exhaustive-w-insts.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-exhaustive-w-insts.ll
@@ -501,14 +501,14 @@ define signext i32 @sext_subw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind
 define zeroext i32 @zext_subw_aext_aext(i32 %a, i32 %b) nounwind {
 ; RV64I-LABEL: zext_subw_aext_aext:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
 ; RV64I-NEXT: slli a0, a0, 32
 ; RV64I-NEXT: srli a0, a0, 32
 ; RV64I-NEXT: ret
 ;
 ; RV64ZBA-LABEL: zext_subw_aext_aext:
 ; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
 ; RV64ZBA-NEXT: zext.w a0, a0
 ; RV64ZBA-NEXT: ret
 %1 = sub i32 %a, %b
@@ -518,14 +518,14 @@ define zeroext i32 @zext_subw_aext_aext(i32 %a, i32 %b) nounwind {
 define zeroext i32 @zext_subw_aext_sext(i32 %a, i32 signext %b) nounwind {
 ; RV64I-LABEL: zext_subw_aext_sext:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
 ; RV64I-NEXT: slli a0, a0, 32
 ; RV64I-NEXT: srli a0, a0, 32
 ; RV64I-NEXT: ret
 ;
 ; RV64ZBA-LABEL: zext_subw_aext_sext:
 ; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
 ; RV64ZBA-NEXT: zext.w a0, a0
 ; RV64ZBA-NEXT: ret
 %1 = sub i32 %a, %b
@@ -535,14 +535,14 @@ define zeroext i32 @zext_subw_aext_sext(i32 %a, i32 signext %b) nounwind {
 define zeroext i32 @zext_subw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
 ; RV64I-LABEL: zext_subw_aext_zext:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
 ; RV64I-NEXT: slli a0, a0, 32
 ; RV64I-NEXT: srli a0, a0, 32
 ; RV64I-NEXT: ret
 ;
 ; RV64ZBA-LABEL: zext_subw_aext_zext:
 ; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
 ; RV64ZBA-NEXT: zext.w a0, a0
 ; RV64ZBA-NEXT: ret
 %1 = sub i32 %a, %b
@@ -552,14 +552,14 @@ define zeroext i32 @zext_subw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
 define zeroext i32 @zext_subw_sext_aext(i32 signext %a, i32 %b) nounwind {
 ; RV64I-LABEL: zext_subw_sext_aext:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
 ; RV64I-NEXT: slli a0, a0, 32
 ; RV64I-NEXT: srli a0, a0, 32
 ; RV64I-NEXT: ret
 ;
 ; RV64ZBA-LABEL: zext_subw_sext_aext:
 ; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
 ; RV64ZBA-NEXT: zext.w a0, a0
 ; RV64ZBA-NEXT: ret
 %1 = sub i32 %a, %b
@@ -569,14 +569,14 @@ define zeroext i32 @zext_subw_sext_aext(i32 signext %a, i32 %b) nounwind {
 define zeroext i32 @zext_subw_sext_sext(i32 signext %a, i32 signext %b) nounwind {
 ; RV64I-LABEL: zext_subw_sext_sext:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
 ; RV64I-NEXT: slli a0, a0, 32
 ; RV64I-NEXT: srli a0, a0, 32
 ; RV64I-NEXT: ret
 ;
 ; RV64ZBA-LABEL: zext_subw_sext_sext:
 ; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
 ; RV64ZBA-NEXT: zext.w a0, a0
 ; RV64ZBA-NEXT: ret
 %1 = sub i32 %a, %b
@@ -586,14 +586,14 @@ define zeroext i32 @zext_subw_sext_sext(i32 signext %a, i32 signext %b) nounwind
 define zeroext i32 @zext_subw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind {
 ; RV64I-LABEL: zext_subw_sext_zext:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
 ; RV64I-NEXT: slli a0, a0, 32
 ; RV64I-NEXT: srli a0, a0, 32
 ; RV64I-NEXT: ret
 ;
 ; RV64ZBA-LABEL: zext_subw_sext_zext:
 ; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
 ; RV64ZBA-NEXT: zext.w a0, a0
 ; RV64ZBA-NEXT: ret
 %1 = sub i32 %a, %b
@@ -603,14 +603,14 @@ define zeroext i32 @zext_subw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind
 define zeroext i32 @zext_subw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
 ; RV64I-LABEL: zext_subw_zext_aext:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
 ; RV64I-NEXT: slli a0, a0, 32
 ; RV64I-NEXT: srli a0, a0, 32
 ; RV64I-NEXT: ret
 ;
 ; RV64ZBA-LABEL: zext_subw_zext_aext:
 ; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
 ; RV64ZBA-NEXT: zext.w a0, a0
 ; RV64ZBA-NEXT: ret
 %1 = sub i32 %a, %b
@@ -620,14 +620,14 @@ define zeroext i32 @zext_subw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
 define zeroext i32 @zext_subw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind {
 ; RV64I-LABEL: zext_subw_zext_sext:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
 ; RV64I-NEXT: slli a0, a0, 32
 ; RV64I-NEXT: srli a0, a0, 32
 ; RV64I-NEXT: ret
 ;
 ; RV64ZBA-LABEL: zext_subw_zext_sext:
 ; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
 ; RV64ZBA-NEXT: zext.w a0, a0
 ; RV64ZBA-NEXT: ret
 %1 = sub i32 %a, %b
@@ -637,14 +637,14 @@ define zeroext i32 @zext_subw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind
 define zeroext i32 @zext_subw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
 ; RV64I-LABEL: zext_subw_zext_zext:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
 ; RV64I-NEXT: slli a0, a0, 32
 ; RV64I-NEXT: srli a0, a0, 32
 ; RV64I-NEXT: ret
 ;
 ; RV64ZBA-LABEL: zext_subw_zext_zext:
 ; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
 ; RV64ZBA-NEXT: zext.w a0, a0
 ; RV64ZBA-NEXT: ret
 %1 = sub i32 %a, %b
diff --git a/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll b/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll
index 0782018..219a5aa 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll
@@ -9,7 +9,7 @@ define signext i32 @addw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin
 ; CHECK-NEXT: not a2, a0
 ; CHECK-NEXT: addi a3, a0, 1
 ; CHECK-NEXT: add a2, a2, a1
-; CHECK-NEXT: subw a1, a1, a0
+; CHECK-NEXT: sub a1, a1, a0
 ; CHECK-NEXT: addi a1, a1, -2
 ; CHECK-NEXT: mul a3, a2, a3
 ; CHECK-NEXT: slli a1, a1, 32
@@ -53,7 +53,7 @@ define signext i32 @subw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin
 ; CHECK-NEXT: bge a0, a1, .LBB1_2
 ; CHECK-NEXT: # %bb.1: # %for.body.preheader
 ; CHECK-NEXT: not a2, a0
-; CHECK-NEXT: subw a3, a1, a0
+; CHECK-NEXT: sub a3, a1, a0
 ; CHECK-NEXT: add a1, a2, a1
 ; CHECK-NEXT: addi a3, a3, -2
 ; CHECK-NEXT: mul a2, a1, a2
@@ -61,7 +61,7 @@ define signext i32 @subw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin
 ; CHECK-NEXT: slli a1, a1, 32
 ; CHECK-NEXT: mulhu a1, a1, a3
 ; CHECK-NEXT: srli a1, a1, 1
-; CHECK-NEXT: subw a0, a2, a0
+; CHECK-NEXT: sub a0, a2, a0
 ; CHECK-NEXT: subw a0, a0, a1
 ; CHECK-NEXT: ret
 ; CHECK-NEXT: .LBB1_2:
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
index 00f7b46..81acb4f7 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
@@ -357,7 +357,7 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: beqz a0, .LBB6_2
 ; RV64I-NEXT: # %bb.1: # %cond.false
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
 ; RV64I-NEXT: and a0, a0, a1
 ; RV64I-NEXT: slli a1, a0, 6
 ; RV64I-NEXT: slli a2, a0, 8
@@ -365,16 +365,16 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT: slli a4, a0, 12
 ; RV64I-NEXT: add a1, a1, a2
 ; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
 ; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
 ; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
 ; RV64I-NEXT: add a1, a4, a1
 ; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
 ; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
 ; RV64I-NEXT: slli a0, a0, 27
 ; RV64I-NEXT: add a1, a1, a3
 ; RV64I-NEXT: add a0, a2, a0
@@ -410,7 +410,7 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
 define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
 ; RV64I-LABEL: cttz_zero_undef_i32:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
 ; RV64I-NEXT: and a0, a0, a1
 ; RV64I-NEXT: slli a1, a0, 6
 ; RV64I-NEXT: slli a2, a0, 8
@@ -418,16 +418,16 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT: slli a4, a0, 12
 ; RV64I-NEXT: add a1, a1, a2
 ; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
 ; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
 ; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
 ; RV64I-NEXT: add a1, a4, a1
 ; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
 ; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
 ; RV64I-NEXT: slli a0, a0, 27
 ; RV64I-NEXT: add a1, a1, a3
 ; RV64I-NEXT: add a0, a2, a0
@@ -455,7 +455,7 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
 define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
 ; RV64I-LABEL: findFirstSet_i32:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
 ; RV64I-NEXT: and a1, a0, a1
 ; RV64I-NEXT: slli a2, a1, 6
 ; RV64I-NEXT: slli a3, a1, 8
@@ -463,16 +463,16 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT: slli a5, a1, 12
 ; RV64I-NEXT: add a2, a2, a3
 ; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
 ; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
 ; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
 ; RV64I-NEXT: add a2, a5, a2
 ; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
 ; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
 ; RV64I-NEXT: slli a1, a1, 27
 ; RV64I-NEXT: add a2, a2, a4
 ; RV64I-NEXT: add a1, a3, a1
@@ -508,7 +508,7 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
 define signext i32 @ffs_i32(i32 signext %a) nounwind {
 ; RV64I-LABEL: ffs_i32:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
 ; RV64I-NEXT: and a1, a0, a1
 ; RV64I-NEXT: slli a2, a1, 6
 ; RV64I-NEXT: slli a3, a1, 8
@@ -516,16 +516,16 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT: slli a5, a1, 12
 ; RV64I-NEXT: add a2, a2, a3
 ; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
 ; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
 ; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
 ; RV64I-NEXT: add a2, a5, a2
 ; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
 ; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
 ; RV64I-NEXT: add a2, a2, a4
 ; RV64I-NEXT: lui a4, %hi(.LCPI9_0)
 ; RV64I-NEXT: addi a4, a4, %lo(.LCPI9_0)
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index fdff4a3..b46f7cc 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -3707,7 +3707,7 @@ define ptr @test_gep_gep_dont_crash(ptr %p, i64 %a1, i64 %a2) {
 define i64 @regression(i32 signext %x, i32 signext %y) {
 ; RV64I-LABEL: regression:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
 ; RV64I-NEXT: slli a0, a0, 32
 ; RV64I-NEXT: srli a1, a0, 29
 ; RV64I-NEXT: srli a0, a0, 27
@@ -3716,14 +3716,14 @@ define i64 @regression(i32 signext %x, i32 signext %y) {
 ;
 ; RV64ZBA-LABEL: regression:
 ; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
 ; RV64ZBA-NEXT: slli.uw a0, a0, 3
 ; RV64ZBA-NEXT: sh1add a0, a0, a0
 ; RV64ZBA-NEXT: ret
 ;
 ; RV64XANDESPERF-LABEL: regression:
 ; RV64XANDESPERF: # %bb.0:
-; RV64XANDESPERF-NEXT: subw a0, a0, a1
+; RV64XANDESPERF-NEXT: sub a0, a0, a1
 ; RV64XANDESPERF-NEXT: slli a0, a0, 32
 ; RV64XANDESPERF-NEXT: srli a0, a0, 29
 ; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a0
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll
index 12fc98c..f2c95f8 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll
@@ -225,7 +225,7 @@ define signext i32 @rol_i32(i32 signext %a, i32 signext %b) nounwind {
 ; RV64I-LABEL: rol_i32:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: sllw a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
 ; RV64I-NEXT: srlw a0, a0, a1
 ; RV64I-NEXT: or a0, a2, a0
 ; RV64I-NEXT: ret
@@ -243,7 +243,7 @@ define void @rol_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind {
 ; RV64I-LABEL: rol_i32_nosext:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: sllw a3, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
 ; RV64I-NEXT: srlw a0, a0, a1
 ; RV64I-NEXT: or a0, a3, a0
 ; RV64I-NEXT: sw a0, 0(a2)
@@ -263,7 +263,7 @@ define signext i32 @rol_i32_neg_constant_rhs(i32 signext %a) nounwind {
 ; RV64I-LABEL: rol_i32_neg_constant_rhs:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: li a1, -2
-; RV64I-NEXT: negw a2, a0
+; RV64I-NEXT: neg a2, a0
 ; RV64I-NEXT: sllw a0, a1, a0
 ; RV64I-NEXT: srlw a1, a1, a2
 ; RV64I-NEXT: or a0, a0, a1
@@ -284,7 +284,7 @@ define i64 @rol_i64(i64 %a, i64 %b) nounwind {
 ; RV64I-LABEL: rol_i64:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: sll a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
 ; RV64I-NEXT: srl a0, a0, a1
 ; RV64I-NEXT: or a0, a2, a0
 ; RV64I-NEXT: ret
@@ -303,7 +303,7 @@ define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind {
 ; RV64I-LABEL: ror_i32:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: srlw a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
 ; RV64I-NEXT: sllw a0, a0, a1
 ; RV64I-NEXT: or a0, a2, a0
 ; RV64I-NEXT: ret
@@ -321,7 +321,7 @@ define void @ror_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind {
 ; RV64I-LABEL: ror_i32_nosext:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: srlw a3, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
 ; RV64I-NEXT: sllw a0, a0, a1
 ; RV64I-NEXT: or a0, a3, a0
 ; RV64I-NEXT: sw a0, 0(a2)
@@ -341,7 +341,7 @@ define signext i32 @ror_i32_neg_constant_rhs(i32 signext %a) nounwind {
 ; RV64I-LABEL: ror_i32_neg_constant_rhs:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: li a1, -2
-; RV64I-NEXT: negw a2, a0
+; RV64I-NEXT: neg a2, a0
 ; RV64I-NEXT: srlw a0, a1, a0
 ; RV64I-NEXT: sllw a1, a1, a2
 ; RV64I-NEXT: or a0, a0, a1
@@ -362,7 +362,7 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind {
 ; RV64I-LABEL: ror_i64:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: srl a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
 ; RV64I-NEXT: sll a0, a0, a1
 ; RV64I-NEXT: or a0, a2, a0
 ; RV64I-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index e640727..d133f9d 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -347,7 +347,7 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: beqz a0, .LBB6_2
 ; RV64I-NEXT: # %bb.1: # %cond.false
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
 ; RV64I-NEXT: and a0, a0, a1
 ; RV64I-NEXT: slli a1, a0, 6
 ; RV64I-NEXT: slli a2, a0, 8
@@ -355,16 +355,16 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT: slli a4, a0, 12
 ; RV64I-NEXT: add a1, a1, a2
 ; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
 ; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
 ; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
 ; RV64I-NEXT: add a1, a4, a1
 ; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
 ; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
 ; RV64I-NEXT: slli a0, a0, 27
 ; RV64I-NEXT: add a1, a1, a3
 ; RV64I-NEXT: add a0, a2, a0
@@ -390,7 +390,7 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
 define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
 ; RV64I-LABEL: cttz_zero_undef_i32:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
 ; RV64I-NEXT: and a0, a0, a1
 ; RV64I-NEXT: slli a1, a0, 6
 ; RV64I-NEXT: slli a2, a0, 8
@@ -398,16 +398,16 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT: slli a4, a0, 12
 ; RV64I-NEXT: add a1, a1, a2
 ; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
 ; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
 ; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
 ; RV64I-NEXT: add a1, a4, a1
 ; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
 ; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
 ; RV64I-NEXT: slli a0, a0, 27
 ; RV64I-NEXT: add a1, a1, a3
 ; RV64I-NEXT: add a0, a2, a0
@@ -430,7 +430,7 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
 define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
 ; RV64I-LABEL: findFirstSet_i32:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
 ; RV64I-NEXT: and a1, a0, a1
 ; RV64I-NEXT: slli a2, a1, 6
 ; RV64I-NEXT: slli a3, a1, 8
@@ -438,16 +438,16 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT: slli a5, a1, 12
 ; RV64I-NEXT: add a2, a2, a3
 ; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
 ; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
 ; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
 ; RV64I-NEXT: add a2, a5, a2
 ; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
 ; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
 ; RV64I-NEXT: slli a1, a1, 27
 ; RV64I-NEXT: add a2, a2, a4
 ; RV64I-NEXT: add a1, a3, a1
@@ -478,7 +478,7 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
 define signext i32 @ffs_i32(i32 signext %a) nounwind {
 ; RV64I-LABEL: ffs_i32:
 ; RV64I: # %bb.0:
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
 ; RV64I-NEXT: and a1, a0, a1
 ; RV64I-NEXT: slli a2, a1, 6
 ; RV64I-NEXT: slli a3, a1, 8
@@ -486,16 +486,16 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT: slli a5, a1, 12
 ; RV64I-NEXT: add a2, a2, a3
 ; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
 ; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
 ; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
 ; RV64I-NEXT: add a2, a5, a2
 ; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
 ; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
 ; RV64I-NEXT: add a2, a2, a4
 ; RV64I-NEXT: lui a4, %hi(.LCPI9_0)
 ; RV64I-NEXT: addi a4, a4, %lo(.LCPI9_0)
@@ -701,7 +701,7 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
 ;
 ; RV64ZBB-LABEL: ctpop_i32_load:
 ; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: lwu a0, 0(a0)
+; RV64ZBB-NEXT: lw a0, 0(a0)
 ; RV64ZBB-NEXT: cpopw a0, a0
 ; RV64ZBB-NEXT: ret
 %a = load i32, ptr %p
@@ -1741,7 +1741,7 @@ define i8 @sub_if_uge_i8(i8 %x, i8 %y) {
 ; RV64ZBB-LABEL: sub_if_uge_i8:
 ; RV64ZBB: # %bb.0:
 ; RV64ZBB-NEXT: zext.b a2, a0
-; RV64ZBB-NEXT: subw a0, a0, a1
+; RV64ZBB-NEXT: sub a0, a0, a1
 ; RV64ZBB-NEXT: zext.b a0, a0
 ; RV64ZBB-NEXT: minu a0, a2, a0
 ; RV64ZBB-NEXT: ret
@@ -1767,7 +1767,7 @@ define i16 @sub_if_uge_i16(i16 %x, i16 %y) {
 ; RV64ZBB-LABEL: sub_if_uge_i16:
 ; RV64ZBB: # %bb.0:
 ; RV64ZBB-NEXT: zext.h a2, a0
-; RV64ZBB-NEXT: subw a0, a0, a1
+; RV64ZBB-NEXT: sub a0, a0, a1
 ; RV64ZBB-NEXT: zext.h a0, a0
 ; RV64ZBB-NEXT: minu a0, a2, a0
 ; RV64ZBB-NEXT: ret
@@ -1852,7 +1852,7 @@ define i32 @sub_if_uge_multiuse_select_i32(i32 %x, i32 %y) {
 ; CHECK-NEXT: sltu a2, a3, a2
 ; CHECK-NEXT: addi a2, a2, -1
 ; CHECK-NEXT: and a1, a2, a1
-; CHECK-NEXT: subw a0, a0, a1
+; CHECK-NEXT: sub a0, a0, a1
 ; CHECK-NEXT: sllw a0, a0, a1
 ; CHECK-NEXT: ret
 %cmp = icmp ult i32 %x, %y
@@ -1870,7 +1870,7 @@ define i32 @sub_if_uge_multiuse_cmp_i32(i32 %x, i32 %y) {
 ; RV64I-NEXT: sltu a4, a3, a2
 ; RV64I-NEXT: addi a4, a4, -1
 ; RV64I-NEXT: and a1, a4, a1
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
 ; RV64I-NEXT: bltu a3, a2, .LBB68_2
 ; RV64I-NEXT: # %bb.1:
 ; RV64I-NEXT: li a1, 4
@@ -1980,7 +1980,7 @@ define i32 @sub_if_uge_C_i32(i32 signext %x) {
 ; RV64I-NEXT: lui a2, 1048560
 ; RV64I-NEXT: addi a1, a1, -16
 ; RV64I-NEXT: sltu a1, a1, a0
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
 ; RV64I-NEXT: addi a2, a2, 15
 ; RV64I-NEXT: and a1, a1, a2
 ; RV64I-NEXT: addw a0, a0, a1
@@ -2036,7 +2036,7 @@ define i32 @sub_if_uge_C_multiuse_cmp_i32(i32 signext %x, ptr %z) {
 ; RV64I-NEXT: lui a3, 1048560
 ; RV64I-NEXT: addi a2, a2, -16
 ; RV64I-NEXT: sltu a2, a2, a0
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
 ; RV64I-NEXT: addi a3, a3, 15
 ; RV64I-NEXT: and a3, a4, a3
 ; RV64I-NEXT: addw a0, a0, a3
diff --git a/llvm/test/CodeGen/RISCV/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbkb.ll
index 696c2a5..818ea72 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbkb.ll
@@ -114,7 +114,7 @@ define i64 @pack_i64_3(ptr %0, ptr %1) {
 ; RV64ZBKB-LABEL: pack_i64_3:
 ; RV64ZBKB: # %bb.0:
 ; RV64ZBKB-NEXT: lw a0, 0(a0)
-; RV64ZBKB-NEXT: lwu a1, 0(a1)
+; RV64ZBKB-NEXT: lw a1, 0(a1)
 ; RV64ZBKB-NEXT: pack a0, a1, a0
 ; RV64ZBKB-NEXT: ret
 %3 = load i32, ptr %0, align 4
diff --git a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
index 5b82b27..81b2b65 100644
--- a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
@@ -63,10 +63,10 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV64-NEXT: and a2, t4, a2
 ; RV64-NEXT: and t0, t3, t1
 ; RV64-NEXT: and a7, t2, a7
-; RV64-NEXT: negw a7, a7
-; RV64-NEXT: negw t0, t0
-; RV64-NEXT: negw a2, a2
-; RV64-NEXT: negw a3, a3
+; RV64-NEXT: neg a7, a7
+; RV64-NEXT: neg t0, t0
+; RV64-NEXT: neg a2, a2
+; RV64-NEXT: neg a3, a3
 ; RV64-NEXT: and a4, a7, a4
 ; RV64-NEXT: and a6, t0, a6
 ; RV64-NEXT: and a1, a2, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
index 07aa05f..48845c5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
@@ -930,7 +930,7 @@ define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, pt
 ; CHECK-NEXT: add a2, a0, a4
 ; CHECK-NEXT: slli a5, a4, 2
 ; CHECK-NEXT: add a1, a1, a4
-; CHECK-NEXT: subw a3, a3, a4
+; CHECK-NEXT: sub a3, a3, a4
 ; CHECK-NEXT: add a1, a1, a5
 ; CHECK-NEXT: slli a3, a3, 32
 ; CHECK-NEXT: srli a3, a3, 32
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
index b6253c6..dcf1ab0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
@@ -204,7 +204,7 @@ define <2 x i64> @mgather_v2i64_align4(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i64> %
 ; RV64-SLOW-NEXT: # %bb.1: # %cond.load
 ; RV64-SLOW-NEXT: vsetvli zero, zero, e64, m8, tu, ma
 ; RV64-SLOW-NEXT: vmv.x.s a1, v8
-; RV64-SLOW-NEXT: lwu a2, 4(a1)
+; RV64-SLOW-NEXT: lw a2, 4(a1)
 ; RV64-SLOW-NEXT: lwu a1, 0(a1)
 ; RV64-SLOW-NEXT: slli a2, a2, 32
 ; RV64-SLOW-NEXT: or a1, a2, a1
@@ -216,7 +216,7 @@ define <2 x i64> @mgather_v2i64_align4(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i64> %
 ; RV64-SLOW-NEXT: vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-SLOW-NEXT: vslidedown.vi v8, v8, 1
 ; RV64-SLOW-NEXT: vmv.x.s a0, v8
-; RV64-SLOW-NEXT: lwu a1, 4(a0)
+; RV64-SLOW-NEXT: lw a1, 4(a0)
 ; RV64-SLOW-NEXT: lwu a0, 0(a0)
 ; RV64-SLOW-NEXT: slli a1, a1, 32
 ; RV64-SLOW-NEXT: or a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
index 1a716f6..e89bac5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
@@ -818,7 +818,7 @@ define <2 x i64> @vwaddu_vx_v2i64_i32(ptr %x, ptr %y) nounwind {
 ; RV64: # %bb.0:
 ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
 ; RV64-NEXT: vle32.v v9, (a0)
-; RV64-NEXT: lwu a0, 0(a1)
+; RV64-NEXT: lw a0, 0(a1)
 ; RV64-NEXT: vwaddu.vx v8, v9, a0
 ; RV64-NEXT: ret
 %a = load <2 x i32>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
index 8ebd93e..b933ef9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
@@ -853,7 +853,7 @@ define <2 x i64> @vwmulsu_vx_v2i64_i32(ptr %x, ptr %y) {
 ; RV64: # %bb.0:
 ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
 ; RV64-NEXT: vle32.v v9, (a0)
-; RV64-NEXT: lwu a0, 0(a1)
+; RV64-NEXT: lw a0, 0(a1)
 ; RV64-NEXT: vwmulsu.vx v8, v9, a0
 ; RV64-NEXT: ret
 %a = load <2 x i32>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
index 90e9ffd..7cedee5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
@@ -710,13 +710,6 @@ define <4 x i32> @vwmulu_vx_v4i32_i8(ptr %x, ptr %y) {
 }
 
 define <4 x i32> @vwmulu_vx_v4i32_i16(ptr %x, ptr %y) {
-; CHECK-LABEL: vwmulu_vx_v4i32_i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vle16.v v9, (a0)
-; CHECK-NEXT: lhu a0, 0(a1)
-; CHECK-NEXT: vwmulu.vx v8, v9, a0
-; CHECK-NEXT: ret
 %a = load <4 x i16>, ptr %x
 %b = load i16, ptr %y
 %c = zext i16 %b to i32
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
index bfdda47..86ac038e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
@@ -821,7 +821,7 @@ define <2 x i64> @vwsubu_vx_v2i64_i32(ptr %x, ptr %y) nounwind {
 ;
 ; RV64-LABEL: vwsubu_vx_v2i64_i32:
 ; RV64: # %bb.0:
-; RV64-NEXT: lwu a1, 0(a1)
+; RV64-NEXT: lw a1, 0(a1)
 ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
 ; RV64-NEXT: vle32.v v9, (a0)
 ; RV64-NEXT: vmv.v.x v10, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index f9ac53b..f481f9c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -274,10 +274,10 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) {
 ; CHECK-NOV-NEXT: sgtz a6, a2
 ; CHECK-NOV-NEXT: sgtz a7, a3
 ; CHECK-NOV-NEXT: sgtz t0, a5
-; CHECK-NOV-NEXT: negw t0, t0
-; CHECK-NOV-NEXT: negw a7, a7
-; CHECK-NOV-NEXT: negw a6, a6
-; CHECK-NOV-NEXT: negw a4, a4
+; CHECK-NOV-NEXT: neg t0, t0
+; CHECK-NOV-NEXT: neg a7, a7
+; CHECK-NOV-NEXT: neg a6, a6
+; CHECK-NOV-NEXT: neg a4, a4
 ; CHECK-NOV-NEXT: and a5, t0, a5
 ; CHECK-NOV-NEXT: and a3, a7, a3
 ; CHECK-NOV-NEXT: and a2, a6, a2
@@ -755,10 +755,10 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-NOV-NEXT: sgtz a4, s1
 ; CHECK-NOV-NEXT: sgtz a5, a1
 ; CHECK-NOV-NEXT: sgtz a6, a3
-; CHECK-NOV-NEXT: negw a6, a6
-; CHECK-NOV-NEXT: negw a5, a5
-; CHECK-NOV-NEXT: negw a4, a4
-; CHECK-NOV-NEXT: negw a2, a2
+; CHECK-NOV-NEXT: neg a6, a6
+; CHECK-NOV-NEXT: neg a5, a5
+; CHECK-NOV-NEXT: neg a4, a4
+; CHECK-NOV-NEXT: neg a2, a2
 ; CHECK-NOV-NEXT: and a3, a6, a3
 ; CHECK-NOV-NEXT: and a1, a5, a1
 ; CHECK-NOV-NEXT: and a4, a4, s1
@@ -1166,10 +1166,10 @@ define <4 x i16> @ustest_f32i16(<4 x float> %x) {
 ; CHECK-NOV-NEXT: sgtz a6, a2
 ; CHECK-NOV-NEXT: sgtz a7, a3
 ; CHECK-NOV-NEXT: sgtz t0, a5
-; CHECK-NOV-NEXT: negw t0, t0
-; CHECK-NOV-NEXT: negw a7, a7
-; CHECK-NOV-NEXT: negw a6, a6
-; CHECK-NOV-NEXT: negw a4, a4
+; CHECK-NOV-NEXT: neg t0, t0
+; CHECK-NOV-NEXT: neg a7, a7
+; CHECK-NOV-NEXT: neg a6, a6
+; CHECK-NOV-NEXT: neg a4, a4
 ; CHECK-NOV-NEXT: and a5, t0, a5
 ; CHECK-NOV-NEXT: and a3, a7, a3
 ; CHECK-NOV-NEXT: and a2, a6, a2
@@ -2040,14 +2040,14 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NOV-NEXT: sgtz t4, a5
 ; CHECK-NOV-NEXT: sgtz t5, a6
 ; CHECK-NOV-NEXT: sgtz t6, a7
-; CHECK-NOV-NEXT: negw t6, t6
-; CHECK-NOV-NEXT: negw t5, t5
-; CHECK-NOV-NEXT: negw t4, t4
-; CHECK-NOV-NEXT: negw t3, t3
-; CHECK-NOV-NEXT: negw t2, t2
-; CHECK-NOV-NEXT: negw t1, t1
-; CHECK-NOV-NEXT: negw t0, t0
-; CHECK-NOV-NEXT: negw a4, a4
+; CHECK-NOV-NEXT: neg t6, t6
+; CHECK-NOV-NEXT: neg t5, t5
+; CHECK-NOV-NEXT: neg t4, t4
+; CHECK-NOV-NEXT: neg t3, t3
+; CHECK-NOV-NEXT: neg t2, t2
+; CHECK-NOV-NEXT: neg t1, t1
+; CHECK-NOV-NEXT: neg t0, t0
+; CHECK-NOV-NEXT: neg a4, a4
 ; CHECK-NOV-NEXT: and a7, t6, a7
 ; CHECK-NOV-NEXT: and a6, t5, a6
 ; CHECK-NOV-NEXT: and a5, t4, a5
@@ -3830,16 +3830,16 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) {
 ; CHECK-NOV-NEXT: mv a5, a3
 ; CHECK-NOV-NEXT: .LBB32_5: # %entry
 ; CHECK-NOV-NEXT: sgtz a3, a5
-; CHECK-NOV-NEXT: negw a3, a3
+; CHECK-NOV-NEXT: neg a3, a3
 ; CHECK-NOV-NEXT: and a3, a3, a5
 ; CHECK-NOV-NEXT: sgtz a5, a4
-; CHECK-NOV-NEXT: negw a5, a5
+; CHECK-NOV-NEXT: neg a5, a5
 ; CHECK-NOV-NEXT: and a4, a5, a4
 ; CHECK-NOV-NEXT: sgtz a5, a2
-; CHECK-NOV-NEXT: negw a5, a5
+; CHECK-NOV-NEXT: neg a5, a5
 ; CHECK-NOV-NEXT: and a2, a5, a2
 ; CHECK-NOV-NEXT: sgtz a5, a1
-; CHECK-NOV-NEXT: negw a5, a5
+; CHECK-NOV-NEXT: neg a5, a5
 ; CHECK-NOV-NEXT: and a1, a5, a1
 ; CHECK-NOV-NEXT: sw a3, 0(a0)
 ; CHECK-NOV-NEXT: sw a4, 4(a0)
@@ -4306,16 +4306,16 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NOV-NEXT: mv a3, a2
 ; CHECK-NOV-NEXT: .LBB35_5: # %entry
 ; CHECK-NOV-NEXT: sgtz a2, a3
-; CHECK-NOV-NEXT: negw a2, a2
+; CHECK-NOV-NEXT: neg a2, a2
 ; CHECK-NOV-NEXT: and a2, a2, a3
 ; CHECK-NOV-NEXT: sgtz a3, a1
-; CHECK-NOV-NEXT: negw a3, a3
+; CHECK-NOV-NEXT: neg a3, a3
 ; CHECK-NOV-NEXT: and a1, a3, a1
 ; CHECK-NOV-NEXT: sgtz a3, s1
-; CHECK-NOV-NEXT: negw a3, a3
+; CHECK-NOV-NEXT: neg a3, a3
 ; CHECK-NOV-NEXT: and a3, a3, s1
 ; CHECK-NOV-NEXT: sgtz a4, a0
-; CHECK-NOV-NEXT: negw a4, a4
+; CHECK-NOV-NEXT: neg a4, a4
 ; CHECK-NOV-NEXT: and a0, a4, a0
 ; CHECK-NOV-NEXT: sw a2, 0(s0)
 ; CHECK-NOV-NEXT: sw a1, 4(s0)
@@ -4707,16 +4707,16 @@ define <4 x i16> @ustest_f32i16_mm(<4 x float> %x) {
 ; CHECK-NOV-NEXT: mv a5, a3
 ; CHECK-NOV-NEXT: .LBB41_5: # %entry
 ; CHECK-NOV-NEXT: sgtz a3, a5
-; CHECK-NOV-NEXT: negw a3, a3
+; CHECK-NOV-NEXT: neg a3, a3
 ; CHECK-NOV-NEXT: and a3, a3, a5
 ; CHECK-NOV-NEXT: sgtz a5, a4
-; CHECK-NOV-NEXT: negw a5, a5
+; CHECK-NOV-NEXT: neg a5, a5
 ; CHECK-NOV-NEXT: and a4, a5, a4
 ; CHECK-NOV-NEXT: sgtz a5, a2
-; CHECK-NOV-NEXT: negw a5, a5
+; CHECK-NOV-NEXT: neg a5, a5
 ; CHECK-NOV-NEXT: and a2, a5, a2
 ; CHECK-NOV-NEXT: sgtz a5, a1
-; CHECK-NOV-NEXT: negw a5, a5
+; CHECK-NOV-NEXT: neg a5, a5
 ; CHECK-NOV-NEXT: and a1, a5, a1
 ; CHECK-NOV-NEXT: sh a3, 0(a0)
 ; CHECK-NOV-NEXT: sh a4, 2(a0)
@@ -5572,28 +5572,28 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NOV-NEXT: mv a7, a3
 ; CHECK-NOV-NEXT: .LBB44_9: # %entry
 ; CHECK-NOV-NEXT: sgtz a3, a7
-; CHECK-NOV-NEXT: negw a3, a3
+; CHECK-NOV-NEXT: neg a3, a3
 ; CHECK-NOV-NEXT: and a3, a3, a7
 ; CHECK-NOV-NEXT: sgtz a7, a6
-; CHECK-NOV-NEXT: negw a7, a7
+; CHECK-NOV-NEXT: neg a7, a7
 ; CHECK-NOV-NEXT: and a6, a7, a6
 ; CHECK-NOV-NEXT: sgtz a7, a5
-; CHECK-NOV-NEXT: negw a7, a7
+; CHECK-NOV-NEXT: neg a7, a7
 ; CHECK-NOV-NEXT: and a5, a7, a5
 ; CHECK-NOV-NEXT: sgtz a7, a4
-; CHECK-NOV-NEXT: negw a7, a7
+; CHECK-NOV-NEXT: neg a7, a7
 ; CHECK-NOV-NEXT: and a4, a7, a4
 ; CHECK-NOV-NEXT: sgtz a7, a2
-; CHECK-NOV-NEXT: negw a7, a7
+; CHECK-NOV-NEXT: neg a7, a7
 ; CHECK-NOV-NEXT: and a2, a7, a2
 ; CHECK-NOV-NEXT: sgtz a7, a1
-; CHECK-NOV-NEXT: negw a7, a7
+; CHECK-NOV-NEXT: neg a7, a7
 ; CHECK-NOV-NEXT: and a1, a7, a1
 ; CHECK-NOV-NEXT: sgtz a7, s1
-; CHECK-NOV-NEXT: negw a7, a7
+; CHECK-NOV-NEXT: neg a7, a7
 ; CHECK-NOV-NEXT: and a7, a7, s1
 ; CHECK-NOV-NEXT: sgtz t0, a0
-; CHECK-NOV-NEXT: negw t0, t0
+; CHECK-NOV-NEXT: neg t0, t0
 ; CHECK-NOV-NEXT: and a0, t0, a0
 ; CHECK-NOV-NEXT: sh a2, 8(s0)
 ; CHECK-NOV-NEXT: sh a1, 10(s0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll b/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll
index 4d9a6ae..749b2041 100644
--- a/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll
@@ -11,7 +11,7 @@ define i32 @vscale_known_nonzero() {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: srli a0, a0, 3
-; CHECK-NEXT: negw a1, a0
+; CHECK-NEXT: neg a1, a0
 ; CHECK-NEXT: and a0, a0, a1
 ; CHECK-NEXT: slli a1, a0, 6
 ; CHECK-NEXT: slli a2, a0, 8
@@ -19,16 +19,16 @@ define i32 @vscale_known_nonzero() {
 ; CHECK-NEXT: slli a4, a0, 12
 ; CHECK-NEXT: add a1, a1, a2
 ; CHECK-NEXT: slli a2, a0, 16
-; CHECK-NEXT: subw a3, a3, a4
+; CHECK-NEXT: sub a3, a3, a4
 ; CHECK-NEXT: slli a4, a0, 18
-; CHECK-NEXT: subw a2, a2, a4
+; CHECK-NEXT: sub a2, a2, a4
 ; CHECK-NEXT: slli a4, a0, 4
-; CHECK-NEXT: subw a4, a0, a4
+; CHECK-NEXT: sub a4, a0, a4
 ; CHECK-NEXT: add a1, a4, a1
 ; CHECK-NEXT: slli a4, a0, 14
-; CHECK-NEXT: subw a3, a3, a4
+; CHECK-NEXT: sub a3, a3, a4
 ; CHECK-NEXT: slli a4, a0, 23
-; CHECK-NEXT: subw a2, a2, a4
+; CHECK-NEXT: sub a2, a2, a4
 ; CHECK-NEXT: slli a0, a0, 27
 ; CHECK-NEXT: add a1, a1, a3
 ; CHECK-NEXT: add a0, a2, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index 8495dfe..32892bc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \
 ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
-; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: --check-prefixes=CHECK,CHECK32,ZVFH
 ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \
 ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
-; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: --check-prefixes=CHECK,CHECK64,ZVFH
 ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
 ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
-; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: --check-prefixes=CHECK,CHECK32,ZVFHMIN
 ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
 ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
-; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: --check-prefixes=CHECK,CHECK64,ZVFHMIN
 
 declare <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, metadata, <vscale x 1 x i1>, i32)
@@ -4820,6 +4820,427 @@ define <vscale x 8 x i1> @fcmp_uno_vf_swap_nxv8f64(<vscale x 8 x double> %va, do
 declare <vscale x 32 x i1> @llvm.vp.fcmp.nxv32f64(<vscale x 32 x double>, <vscale x 32 x double>, metadata, <vscale x 32 x i1>, i32)
 
 define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vscale x 32 x double> %vb, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK32-LABEL: fcmp_oeq_vv_nxv32f64:
+; CHECK32: # %bb.0:
+; CHECK32-NEXT: addi sp, sp, -48
+; CHECK32-NEXT: .cfi_def_cfa_offset 48
+; CHECK32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
+; CHECK32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
+; CHECK32-NEXT: sw s1, 36(sp) # 4-byte Folded Spill
+; CHECK32-NEXT: sw s2, 32(sp) # 4-byte Folded Spill
+; CHECK32-NEXT: sw s3, 28(sp) # 4-byte Folded Spill
+; CHECK32-NEXT: sw s4, 24(sp) # 4-byte Folded Spill
+; CHECK32-NEXT: .cfi_offset ra, -4
+; CHECK32-NEXT: .cfi_offset s0, -8
+; CHECK32-NEXT: .cfi_offset s1, -12
+; CHECK32-NEXT: .cfi_offset s2, -16
+; CHECK32-NEXT: .cfi_offset s3, -20
+; CHECK32-NEXT: .cfi_offset s4, -24
+; CHECK32-NEXT: csrr a1, vlenb
+; CHECK32-NEXT: slli a1, a1, 1
+; CHECK32-NEXT: mv a3, a1
+; CHECK32-NEXT: slli a1, a1, 2
+; CHECK32-NEXT: add a3, a3, a1
+; CHECK32-NEXT: slli a1, a1, 1
+; CHECK32-NEXT: add a1, a1, a3
+; CHECK32-NEXT: sub sp, sp, a1
+; CHECK32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x1a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 26 * vlenb
+; CHECK32-NEXT: mv s1, a6
+; CHECK32-NEXT: csrr a1, vlenb
+; CHECK32-NEXT: add a1, sp, a1
+; CHECK32-NEXT: addi a1, a1, 16
+; CHECK32-NEXT: vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
+; CHECK32-NEXT: mv s3, a2
+; CHECK32-NEXT: mv s2, a0
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a1, a0, 3
+; CHECK32-NEXT: add a0, a1, a0
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: mv a1, a0
+; CHECK32-NEXT: slli a0, a0, 3
+; CHECK32-NEXT: add a0, a0, a1
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT: csrr s0, vlenb
+; CHECK32-NEXT: li a1, 24
+; CHECK32-NEXT: mv a0, s0
+; CHECK32-NEXT: call __mulsi3
+; CHECK32-NEXT: csrr a1, vlenb
+; CHECK32-NEXT: add a1, sp, a1
+; CHECK32-NEXT: addi a1, a1, 16
+; CHECK32-NEXT: vl1r.v v6, (a1) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT: mv a1, a0
+; CHECK32-NEXT: slli a4, s0, 3
+; CHECK32-NEXT: srli s4, s0, 2
+; CHECK32-NEXT: srli a0, s0, 3
+; CHECK32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; CHECK32-NEXT: vslidedown.vx v7, v6, s4
+; CHECK32-NEXT: add a2, s3, a4
+; CHECK32-NEXT: vl8re64.v v16, (a2)
+; CHECK32-NEXT: slli a6, s0, 4
+; CHECK32-NEXT: slli a2, s0, 1
+; CHECK32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK32-NEXT: vslidedown.vx v0, v6, a0
+; CHECK32-NEXT: mv a3, s1
+; CHECK32-NEXT: bltu s1, a2, .LBB257_2
+; CHECK32-NEXT: # %bb.1:
+; CHECK32-NEXT: mv a3, a2
+; CHECK32-NEXT: .LBB257_2:
+; CHECK32-NEXT: add a5, s3, a1
+; CHECK32-NEXT: add a1, s2, a4
+; CHECK32-NEXT: vslidedown.vx v9, v7, a0
+; CHECK32-NEXT: csrr a4, vlenb
+; CHECK32-NEXT: slli a7, a4, 4
+; CHECK32-NEXT: add a4, a7, a4
+; CHECK32-NEXT: add a4, sp, a4
+; CHECK32-NEXT: addi a4, a4, 16
+; CHECK32-NEXT: vs1r.v v9, (a4) # vscale x 8-byte Folded Spill
+; CHECK32-NEXT: add a4, s3, a6
+; CHECK32-NEXT: vl8re64.v v24, (s3)
+; CHECK32-NEXT: sub a6, a3, s0
+; CHECK32-NEXT: sltu a7, a3, a6
+; CHECK32-NEXT: addi a7, a7, -1
+; CHECK32-NEXT: and a6, a7, a6
+; CHECK32-NEXT: csrr a7, vlenb
+; CHECK32-NEXT: slli t0, a7, 3
+; CHECK32-NEXT: add a7, t0, a7
+; CHECK32-NEXT: add a7, sp, a7
+; CHECK32-NEXT: addi a7, a7, 16
+; CHECK32-NEXT: vl8r.v v8, (a7) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; CHECK32-NEXT: vmfeq.vv v5, v8, v16, v0.t
+; CHECK32-NEXT: bltu a3, s0, .LBB257_4
+; CHECK32-NEXT: # %bb.3:
+; CHECK32-NEXT: mv a3, s0
+; CHECK32-NEXT: .LBB257_4:
+; CHECK32-NEXT: vmv1r.v v0, v6
+; CHECK32-NEXT: vl8re64.v v8, (a5)
+; CHECK32-NEXT: csrr a5, vlenb
+; CHECK32-NEXT: slli a6, a5, 3
+; CHECK32-NEXT: add a5, a6, a5
+; CHECK32-NEXT: add a5, sp, a5
+; CHECK32-NEXT: addi a5, a5, 16
+; CHECK32-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT: csrr a5, vlenb
+; CHECK32-NEXT: slli a5, a5, 1
+; CHECK32-NEXT: mv a6, a5
+; CHECK32-NEXT: slli a5, a5, 3
+; CHECK32-NEXT: add a5, a5, a6
+; CHECK32-NEXT: add a5, sp, a5
+; CHECK32-NEXT: addi a5, a5, 16
+; CHECK32-NEXT: vl8r.v v16, (a5) # vscale x
64-byte Folded Reload +; CHECK32-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; CHECK32-NEXT: vmfeq.vv v8, v16, v24, v0.t +; CHECK32-NEXT: vl8re64.v v16, (a1) +; CHECK32-NEXT: csrr a1, vlenb +; CHECK32-NEXT: add a1, sp, a1 +; CHECK32-NEXT: addi a1, a1, 16 +; CHECK32-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill +; CHECK32-NEXT: vl8re64.v v16, (a4) +; CHECK32-NEXT: sub a1, s1, a2 +; CHECK32-NEXT: sltu a2, s1, a1 +; CHECK32-NEXT: vl8re64.v v24, (s2) +; CHECK32-NEXT: addi a2, a2, -1 +; CHECK32-NEXT: and s1, a2, a1 +; CHECK32-NEXT: vsetvli zero, s4, e8, mf2, tu, ma +; CHECK32-NEXT: vslideup.vx v8, v5, a0 +; CHECK32-NEXT: csrr a1, vlenb +; CHECK32-NEXT: slli a1, a1, 1 +; CHECK32-NEXT: mv a2, a1 +; CHECK32-NEXT: slli a1, a1, 3 +; CHECK32-NEXT: add a1, a1, a2 +; CHECK32-NEXT: add a1, sp, a1 +; CHECK32-NEXT: addi a1, a1, 16 +; CHECK32-NEXT: vs1r.v v8, (a1) # vscale x 8-byte Folded Spill +; CHECK32-NEXT: mv a1, s1 +; CHECK32-NEXT: bltu s1, s0, .LBB257_6 +; CHECK32-NEXT: # %bb.5: +; CHECK32-NEXT: mv a1, s0 +; CHECK32-NEXT: .LBB257_6: +; CHECK32-NEXT: vmv1r.v v0, v7 +; CHECK32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK32-NEXT: vmfeq.vv v8, v24, v16, v0.t +; CHECK32-NEXT: addi a1, sp, 16 +; CHECK32-NEXT: vs1r.v v8, (a1) # vscale x 8-byte Folded Spill +; CHECK32-NEXT: li a1, 3 +; CHECK32-NEXT: call __mulsi3 +; CHECK32-NEXT: csrr a1, vlenb +; CHECK32-NEXT: slli a2, a1, 4 +; CHECK32-NEXT: add a1, a2, a1 +; CHECK32-NEXT: add a1, sp, a1 +; CHECK32-NEXT: addi a1, a1, 16 +; CHECK32-NEXT: vl1r.v v0, (a1) # vscale x 8-byte Folded Reload +; CHECK32-NEXT: csrr a1, vlenb +; CHECK32-NEXT: slli a1, a1, 1 +; CHECK32-NEXT: mv a2, a1 +; CHECK32-NEXT: slli a1, a1, 3 +; CHECK32-NEXT: add a1, a1, a2 +; CHECK32-NEXT: add a1, sp, a1 +; CHECK32-NEXT: addi a1, a1, 16 +; CHECK32-NEXT: vl1r.v v9, (a1) # vscale x 8-byte Folded Reload +; CHECK32-NEXT: addi a1, sp, 16 +; CHECK32-NEXT: vl1r.v v8, (a1) # vscale x 8-byte Folded Reload +; CHECK32-NEXT: vsetvli zero, a0, e8, mf2, tu, ma +; CHECK32-NEXT: vslideup.vx v9, v8, s4 +; CHECK32-NEXT: sub a1, s1, s0 +; CHECK32-NEXT: sltu a2, s1, a1 +; CHECK32-NEXT: addi a2, a2, -1 +; CHECK32-NEXT: and a1, a2, a1 +; CHECK32-NEXT: csrr a2, vlenb +; CHECK32-NEXT: slli a3, a2, 3 +; CHECK32-NEXT: add a2, a3, a2 +; CHECK32-NEXT: add a2, sp, a2 +; CHECK32-NEXT: addi a2, a2, 16 +; CHECK32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload +; CHECK32-NEXT: csrr a2, vlenb +; CHECK32-NEXT: add a2, sp, a2 +; CHECK32-NEXT: addi a2, a2, 16 +; CHECK32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; CHECK32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK32-NEXT: vmfeq.vv v8, v24, v16, v0.t +; CHECK32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK32-NEXT: vslideup.vx v9, v8, a0 +; CHECK32-NEXT: vmv1r.v v0, v9 +; CHECK32-NEXT: csrr a0, vlenb +; CHECK32-NEXT: slli a0, a0, 1 +; CHECK32-NEXT: mv a1, a0 +; CHECK32-NEXT: slli a0, a0, 2 +; CHECK32-NEXT: add a1, a1, a0 +; CHECK32-NEXT: slli a0, a0, 1 +; CHECK32-NEXT: add a0, a0, a1 +; CHECK32-NEXT: add sp, sp, a0 +; CHECK32-NEXT: .cfi_def_cfa sp, 48 +; CHECK32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; CHECK32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; CHECK32-NEXT: lw s1, 36(sp) # 4-byte Folded Reload +; CHECK32-NEXT: lw s2, 32(sp) # 4-byte Folded Reload +; CHECK32-NEXT: lw s3, 28(sp) # 4-byte Folded Reload +; CHECK32-NEXT: lw s4, 24(sp) # 4-byte Folded Reload +; CHECK32-NEXT: .cfi_restore ra +; CHECK32-NEXT: .cfi_restore s0 +; CHECK32-NEXT: .cfi_restore s1 +; CHECK32-NEXT: .cfi_restore s2 +; CHECK32-NEXT: .cfi_restore s3 +; 
CHECK32-NEXT: .cfi_restore s4 +; CHECK32-NEXT: addi sp, sp, 48 +; CHECK32-NEXT: .cfi_def_cfa_offset 0 +; CHECK32-NEXT: ret +; +; CHECK64-LABEL: fcmp_oeq_vv_nxv32f64: +; CHECK64: # %bb.0: +; CHECK64-NEXT: addi sp, sp, -64 +; CHECK64-NEXT: .cfi_def_cfa_offset 64 +; CHECK64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill +; CHECK64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; CHECK64-NEXT: sd s1, 40(sp) # 8-byte Folded Spill +; CHECK64-NEXT: sd s2, 32(sp) # 8-byte Folded Spill +; CHECK64-NEXT: sd s3, 24(sp) # 8-byte Folded Spill +; CHECK64-NEXT: sd s4, 16(sp) # 8-byte Folded Spill +; CHECK64-NEXT: .cfi_offset ra, -8 +; CHECK64-NEXT: .cfi_offset s0, -16 +; CHECK64-NEXT: .cfi_offset s1, -24 +; CHECK64-NEXT: .cfi_offset s2, -32 +; CHECK64-NEXT: .cfi_offset s3, -40 +; CHECK64-NEXT: .cfi_offset s4, -48 +; CHECK64-NEXT: csrr a1, vlenb +; CHECK64-NEXT: slli a1, a1, 1 +; CHECK64-NEXT: mv a3, a1 +; CHECK64-NEXT: slli a1, a1, 2 +; CHECK64-NEXT: add a3, a3, a1 +; CHECK64-NEXT: slli a1, a1, 1 +; CHECK64-NEXT: add a1, a1, a3 +; CHECK64-NEXT: sub sp, sp, a1 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x1a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 26 * vlenb +; CHECK64-NEXT: mv s1, a6 +; CHECK64-NEXT: csrr a1, vlenb +; CHECK64-NEXT: add a1, sp, a1 +; CHECK64-NEXT: addi a1, a1, 16 +; CHECK64-NEXT: vs1r.v v0, (a1) # vscale x 8-byte Folded Spill +; CHECK64-NEXT: mv s3, a2 +; CHECK64-NEXT: mv s2, a0 +; CHECK64-NEXT: csrr a0, vlenb +; CHECK64-NEXT: slli a1, a0, 3 +; CHECK64-NEXT: add a0, a1, a0 +; CHECK64-NEXT: add a0, sp, a0 +; CHECK64-NEXT: addi a0, a0, 16 +; CHECK64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; CHECK64-NEXT: csrr a0, vlenb +; CHECK64-NEXT: slli a0, a0, 1 +; CHECK64-NEXT: mv a1, a0 +; CHECK64-NEXT: slli a0, a0, 3 +; CHECK64-NEXT: add a0, a0, a1 +; CHECK64-NEXT: add a0, sp, a0 +; CHECK64-NEXT: addi a0, a0, 16 +; CHECK64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; CHECK64-NEXT: csrr s0, vlenb +; CHECK64-NEXT: li a1, 24 +; CHECK64-NEXT: mv a0, s0 +; CHECK64-NEXT: call __muldi3 +; CHECK64-NEXT: csrr a1, vlenb +; CHECK64-NEXT: add a1, sp, a1 +; CHECK64-NEXT: addi a1, a1, 16 +; CHECK64-NEXT: vl1r.v v6, (a1) # vscale x 8-byte Folded Reload +; CHECK64-NEXT: mv a1, a0 +; CHECK64-NEXT: slli a4, s0, 3 +; CHECK64-NEXT: srli s4, s0, 2 +; CHECK64-NEXT: srli a0, s0, 3 +; CHECK64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; CHECK64-NEXT: vslidedown.vx v7, v6, s4 +; CHECK64-NEXT: add a2, s3, a4 +; CHECK64-NEXT: vl8re64.v v16, (a2) +; CHECK64-NEXT: slli a6, s0, 4 +; CHECK64-NEXT: slli a2, s0, 1 +; CHECK64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; CHECK64-NEXT: vslidedown.vx v0, v6, a0 +; CHECK64-NEXT: mv a3, s1 +; CHECK64-NEXT: bltu s1, a2, .LBB257_2 +; CHECK64-NEXT: # %bb.1: +; CHECK64-NEXT: mv a3, a2 +; CHECK64-NEXT: .LBB257_2: +; CHECK64-NEXT: add a5, s3, a1 +; CHECK64-NEXT: add a1, s2, a4 +; CHECK64-NEXT: vslidedown.vx v9, v7, a0 +; CHECK64-NEXT: csrr a4, vlenb +; CHECK64-NEXT: slli a7, a4, 4 +; CHECK64-NEXT: add a4, a7, a4 +; CHECK64-NEXT: add a4, sp, a4 +; CHECK64-NEXT: addi a4, a4, 16 +; CHECK64-NEXT: vs1r.v v9, (a4) # vscale x 8-byte Folded Spill +; CHECK64-NEXT: add a4, s3, a6 +; CHECK64-NEXT: vl8re64.v v24, (s3) +; CHECK64-NEXT: sub a6, a3, s0 +; CHECK64-NEXT: sltu a7, a3, a6 +; CHECK64-NEXT: addi a7, a7, -1 +; CHECK64-NEXT: and a6, a7, a6 +; CHECK64-NEXT: csrr a7, vlenb +; CHECK64-NEXT: slli t0, a7, 3 +; CHECK64-NEXT: add a7, t0, a7 +; CHECK64-NEXT: add a7, sp, a7 +; CHECK64-NEXT: addi a7, a7, 16 +; CHECK64-NEXT: vl8r.v v8, (a7) # vscale 
x 64-byte Folded Reload +; CHECK64-NEXT: vsetvli zero, a6, e64, m8, ta, ma +; CHECK64-NEXT: vmfeq.vv v5, v8, v16, v0.t +; CHECK64-NEXT: bltu a3, s0, .LBB257_4 +; CHECK64-NEXT: # %bb.3: +; CHECK64-NEXT: mv a3, s0 +; CHECK64-NEXT: .LBB257_4: +; CHECK64-NEXT: vmv1r.v v0, v6 +; CHECK64-NEXT: vl8re64.v v8, (a5) +; CHECK64-NEXT: csrr a5, vlenb +; CHECK64-NEXT: slli a6, a5, 3 +; CHECK64-NEXT: add a5, a6, a5 +; CHECK64-NEXT: add a5, sp, a5 +; CHECK64-NEXT: addi a5, a5, 16 +; CHECK64-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill +; CHECK64-NEXT: csrr a5, vlenb +; CHECK64-NEXT: slli a5, a5, 1 +; CHECK64-NEXT: mv a6, a5 +; CHECK64-NEXT: slli a5, a5, 3 +; CHECK64-NEXT: add a5, a5, a6 +; CHECK64-NEXT: add a5, sp, a5 +; CHECK64-NEXT: addi a5, a5, 16 +; CHECK64-NEXT: vl8r.v v16, (a5) # vscale x 64-byte Folded Reload +; CHECK64-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; CHECK64-NEXT: vmfeq.vv v8, v16, v24, v0.t +; CHECK64-NEXT: vl8re64.v v16, (a1) +; CHECK64-NEXT: csrr a1, vlenb +; CHECK64-NEXT: add a1, sp, a1 +; CHECK64-NEXT: addi a1, a1, 16 +; CHECK64-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill +; CHECK64-NEXT: vl8re64.v v16, (a4) +; CHECK64-NEXT: sub a1, s1, a2 +; CHECK64-NEXT: sltu a2, s1, a1 +; CHECK64-NEXT: vl8re64.v v24, (s2) +; CHECK64-NEXT: addi a2, a2, -1 +; CHECK64-NEXT: and s1, a2, a1 +; CHECK64-NEXT: vsetvli zero, s4, e8, mf2, tu, ma +; CHECK64-NEXT: vslideup.vx v8, v5, a0 +; CHECK64-NEXT: csrr a1, vlenb +; CHECK64-NEXT: slli a1, a1, 1 +; CHECK64-NEXT: mv a2, a1 +; CHECK64-NEXT: slli a1, a1, 3 +; CHECK64-NEXT: add a1, a1, a2 +; CHECK64-NEXT: add a1, sp, a1 +; CHECK64-NEXT: addi a1, a1, 16 +; CHECK64-NEXT: vs1r.v v8, (a1) # vscale x 8-byte Folded Spill +; CHECK64-NEXT: mv a1, s1 +; CHECK64-NEXT: bltu s1, s0, .LBB257_6 +; CHECK64-NEXT: # %bb.5: +; CHECK64-NEXT: mv a1, s0 +; CHECK64-NEXT: .LBB257_6: +; CHECK64-NEXT: vmv1r.v v0, v7 +; CHECK64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK64-NEXT: vmfeq.vv v8, v24, v16, v0.t +; CHECK64-NEXT: addi a1, sp, 16 +; CHECK64-NEXT: vs1r.v v8, (a1) # vscale x 8-byte Folded Spill +; CHECK64-NEXT: li a1, 3 +; CHECK64-NEXT: call __muldi3 +; CHECK64-NEXT: csrr a1, vlenb +; CHECK64-NEXT: slli a2, a1, 4 +; CHECK64-NEXT: add a1, a2, a1 +; CHECK64-NEXT: add a1, sp, a1 +; CHECK64-NEXT: addi a1, a1, 16 +; CHECK64-NEXT: vl1r.v v0, (a1) # vscale x 8-byte Folded Reload +; CHECK64-NEXT: csrr a1, vlenb +; CHECK64-NEXT: slli a1, a1, 1 +; CHECK64-NEXT: mv a2, a1 +; CHECK64-NEXT: slli a1, a1, 3 +; CHECK64-NEXT: add a1, a1, a2 +; CHECK64-NEXT: add a1, sp, a1 +; CHECK64-NEXT: addi a1, a1, 16 +; CHECK64-NEXT: vl1r.v v9, (a1) # vscale x 8-byte Folded Reload +; CHECK64-NEXT: addi a1, sp, 16 +; CHECK64-NEXT: vl1r.v v8, (a1) # vscale x 8-byte Folded Reload +; CHECK64-NEXT: vsetvli zero, a0, e8, mf2, tu, ma +; CHECK64-NEXT: vslideup.vx v9, v8, s4 +; CHECK64-NEXT: sub a1, s1, s0 +; CHECK64-NEXT: sltu a2, s1, a1 +; CHECK64-NEXT: addi a2, a2, -1 +; CHECK64-NEXT: and a1, a2, a1 +; CHECK64-NEXT: csrr a2, vlenb +; CHECK64-NEXT: slli a3, a2, 3 +; CHECK64-NEXT: add a2, a3, a2 +; CHECK64-NEXT: add a2, sp, a2 +; CHECK64-NEXT: addi a2, a2, 16 +; CHECK64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload +; CHECK64-NEXT: csrr a2, vlenb +; CHECK64-NEXT: add a2, sp, a2 +; CHECK64-NEXT: addi a2, a2, 16 +; CHECK64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; CHECK64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK64-NEXT: vmfeq.vv v8, v24, v16, v0.t +; CHECK64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK64-NEXT: vslideup.vx v9, v8, a0 +; CHECK64-NEXT: 
vmv1r.v v0, v9 +; CHECK64-NEXT: csrr a0, vlenb +; CHECK64-NEXT: slli a0, a0, 1 +; CHECK64-NEXT: mv a1, a0 +; CHECK64-NEXT: slli a0, a0, 2 +; CHECK64-NEXT: add a1, a1, a0 +; CHECK64-NEXT: slli a0, a0, 1 +; CHECK64-NEXT: add a0, a0, a1 +; CHECK64-NEXT: add sp, sp, a0 +; CHECK64-NEXT: .cfi_def_cfa sp, 64 +; CHECK64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload +; CHECK64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload +; CHECK64-NEXT: ld s1, 40(sp) # 8-byte Folded Reload +; CHECK64-NEXT: ld s2, 32(sp) # 8-byte Folded Reload +; CHECK64-NEXT: ld s3, 24(sp) # 8-byte Folded Reload +; CHECK64-NEXT: ld s4, 16(sp) # 8-byte Folded Reload +; CHECK64-NEXT: .cfi_restore ra +; CHECK64-NEXT: .cfi_restore s0 +; CHECK64-NEXT: .cfi_restore s1 +; CHECK64-NEXT: .cfi_restore s2 +; CHECK64-NEXT: .cfi_restore s3 +; CHECK64-NEXT: .cfi_restore s4 +; CHECK64-NEXT: addi sp, sp, 64 +; CHECK64-NEXT: .cfi_def_cfa_offset 0 +; CHECK64-NEXT: ret %v = call <vscale x 32 x i1> @llvm.vp.fcmp.nxv32f64(<vscale x 32 x double> %va, <vscale x 32 x double> %vb, metadata !"oeq", <vscale x 32 x i1> %m, i32 %evl) ret <vscale x 32 x i1> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll index c216fb6..346e40a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -549,7 +549,7 @@ define void @sink_splat_rsub_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB10_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: lw a3, 0(a2) -; CHECK-NEXT: subw a3, a1, a3 +; CHECK-NEXT: sub a3, a1, a3 ; CHECK-NEXT: sw a3, 0(a2) ; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: bne a2, a0, .LBB10_6 diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll index 66e114c..f295bd8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll @@ -2300,7 +2300,7 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-RV64-NEXT: j .LBB98_5 ; CHECK-RV64-NEXT: .LBB98_2: # %vector.ph ; CHECK-RV64-NEXT: srli a3, a4, 1 -; CHECK-RV64-NEXT: negw a2, a3 +; CHECK-RV64-NEXT: neg a2, a3 ; CHECK-RV64-NEXT: andi a2, a2, 256 ; CHECK-RV64-NEXT: slli a4, a4, 1 ; CHECK-RV64-NEXT: mv a5, a0 @@ -2393,7 +2393,7 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-ZVKB-NOZBB64-NEXT: j .LBB98_5 ; CHECK-ZVKB-NOZBB64-NEXT: .LBB98_2: # %vector.ph ; CHECK-ZVKB-NOZBB64-NEXT: srli a3, a4, 1 -; CHECK-ZVKB-NOZBB64-NEXT: negw a2, a3 +; CHECK-ZVKB-NOZBB64-NEXT: neg a2, a3 ; CHECK-ZVKB-NOZBB64-NEXT: andi a2, a2, 256 ; CHECK-ZVKB-NOZBB64-NEXT: slli a4, a4, 1 ; CHECK-ZVKB-NOZBB64-NEXT: mv a5, a0 @@ -2485,7 +2485,7 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-ZVKB-ZBB64-NEXT: j .LBB98_5 ; CHECK-ZVKB-ZBB64-NEXT: .LBB98_2: # %vector.ph ; CHECK-ZVKB-ZBB64-NEXT: srli a3, a4, 1 -; CHECK-ZVKB-ZBB64-NEXT: negw a2, a3 +; CHECK-ZVKB-ZBB64-NEXT: neg a2, a3 ; CHECK-ZVKB-ZBB64-NEXT: andi a2, a2, 256 ; CHECK-ZVKB-ZBB64-NEXT: slli a4, a4, 1 ; CHECK-ZVKB-ZBB64-NEXT: mv a5, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll index 3740737..d0b184b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll @@ -50,9 +50,9 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) { ; RV64-NEXT: sgtz a5, a5 ; RV64-NEXT: sgtz a4, a4 ; RV64-NEXT: sgtz a3, 
a3 -; RV64-NEXT: negw a3, a3 -; RV64-NEXT: negw a4, a4 -; RV64-NEXT: negw a5, a5 +; RV64-NEXT: neg a3, a3 +; RV64-NEXT: neg a4, a4 +; RV64-NEXT: neg a5, a5 ; RV64-NEXT: and a3, a3, a6 ; RV64-NEXT: and a0, a4, a0 ; RV64-NEXT: and a2, a5, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index 578b67e..96a7b14 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -542,10 +542,8 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x define {<vscale x 16 x i8>, <vscale x 16 x i8>} @masked_load_factor2(ptr %p) { ; CHECK-LABEL: masked_load_factor2: ; CHECK: # %bb.0: -; CHECK-NEXT: vl4r.v v12, (a0) -; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v12, 0 -; CHECK-NEXT: vnsrl.wi v10, v12, 8 +; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; CHECK-NEXT: vlseg2e8.v v8, (a0) ; CHECK-NEXT: ret %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> splat (i1 true), <vscale x 32 x i8> poison) %deinterleaved.results = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec) @@ -555,23 +553,8 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @masked_load_factor2(ptr %p) { define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_loat_factor4(ptr %p) { ; CHECK-LABEL: masked_loat_factor4: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; CHECK-NEXT: vl4r.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs4r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vlseg4e8.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> splat (i1 true), <vscale x 32 x i8> poison) %deinterleaved.results = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave4.nxv32i8(<vscale x 32 x i8> %vec) @@ -581,56 +564,8 @@ define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_loat_factor4_mask(ptr %p, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: masked_loat_factor4_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: add a3, a1, a2 -; CHECK-NEXT: vmv.v.v v9, v8 -; CHECK-NEXT: srli a4, a2, 2 -; CHECK-NEXT: vmv.v.v v10, v8 -; CHECK-NEXT: srli a5, a2, 3 -; CHECK-NEXT: vmv.v.v v11, v8 -; CHECK-NEXT: vsseg4e8.v v8, (a1) -; CHECK-NEXT: vl1r.v v8, (a1) -; CHECK-NEXT: 
add a1, a4, a5 -; CHECK-NEXT: vl1r.v v9, (a3) -; CHECK-NEXT: add a3, a3, a2 -; CHECK-NEXT: add a2, a3, a2 -; CHECK-NEXT: vl1r.v v10, (a3) -; CHECK-NEXT: vl1r.v v11, (a2) -; CHECK-NEXT: vmsne.vi v9, v9, 0 -; CHECK-NEXT: vmsne.vi v0, v8, 0 -; CHECK-NEXT: vmsne.vi v8, v10, 0 -; CHECK-NEXT: vmsne.vi v10, v11, 0 -; CHECK-NEXT: vsetvli zero, a4, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vx v0, v9, a5 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vx v0, v8, a4 -; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v0, v10, a1 -; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0), v0.t -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs4r.v v8, (a0) -; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; CHECK-NEXT: vlseg4e8.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t ; CHECK-NEXT: ret %interleaved.mask = tail call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask) %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison) diff --git a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll index 25a226e..eb129da 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll @@ -959,7 +959,7 @@ define <vscale x 1 x i64> @vrol_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b) { ; CHECK-RV64-LABEL: vrol_vx_nxv1i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 -; CHECK-RV64-NEXT: negw a0, a0 +; CHECK-RV64-NEXT: neg a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; CHECK-RV64-NEXT: vsll.vx v9, v8, a1 ; CHECK-RV64-NEXT: andi a0, a0, 63 @@ -1022,7 +1022,7 @@ define <vscale x 2 x i64> @vrol_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b) { ; CHECK-RV64-LABEL: vrol_vx_nxv2i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 -; CHECK-RV64-NEXT: negw a0, a0 +; CHECK-RV64-NEXT: neg a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma ; CHECK-RV64-NEXT: vsll.vx v10, v8, a1 ; CHECK-RV64-NEXT: andi a0, a0, 63 @@ -1085,7 +1085,7 @@ define <vscale x 4 x i64> @vrol_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b) { ; CHECK-RV64-LABEL: vrol_vx_nxv4i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 -; CHECK-RV64-NEXT: negw a0, a0 +; CHECK-RV64-NEXT: neg a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma ; CHECK-RV64-NEXT: vsll.vx v12, v8, a1 ; CHECK-RV64-NEXT: andi a0, a0, 63 @@ -1148,7 +1148,7 @@ define <vscale x 8 x i64> @vrol_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b) { ; CHECK-RV64-LABEL: vrol_vx_nxv8i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 -; CHECK-RV64-NEXT: negw a0, a0 +; CHECK-RV64-NEXT: neg a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; CHECK-RV64-NEXT: vsll.vx v16, v8, a1 ; CHECK-RV64-NEXT: andi a0, a0, 63 diff --git a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll index 9e63b61..97524ac 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll @@ -1626,7 +1626,7 @@ define <vscale x 1 x i64> @vror_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b) { ; 
CHECK-RV64-LABEL: vror_vx_nxv1i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 -; CHECK-RV64-NEXT: negw a0, a0 +; CHECK-RV64-NEXT: neg a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; CHECK-RV64-NEXT: vsrl.vx v9, v8, a1 ; CHECK-RV64-NEXT: andi a0, a0, 63 @@ -1728,7 +1728,7 @@ define <vscale x 2 x i64> @vror_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b) { ; CHECK-RV64-LABEL: vror_vx_nxv2i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 -; CHECK-RV64-NEXT: negw a0, a0 +; CHECK-RV64-NEXT: neg a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma ; CHECK-RV64-NEXT: vsrl.vx v10, v8, a1 ; CHECK-RV64-NEXT: andi a0, a0, 63 @@ -1830,7 +1830,7 @@ define <vscale x 4 x i64> @vror_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b) { ; CHECK-RV64-LABEL: vror_vx_nxv4i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 -; CHECK-RV64-NEXT: negw a0, a0 +; CHECK-RV64-NEXT: neg a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma ; CHECK-RV64-NEXT: vsrl.vx v12, v8, a1 ; CHECK-RV64-NEXT: andi a0, a0, 63 @@ -1932,7 +1932,7 @@ define <vscale x 8 x i64> @vror_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b) { ; CHECK-RV64-LABEL: vror_vx_nxv8i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 -; CHECK-RV64-NEXT: negw a0, a0 +; CHECK-RV64-NEXT: neg a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; CHECK-RV64-NEXT: vsrl.vx v16, v8, a1 ; CHECK-RV64-NEXT: andi a0, a0, 63 diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll index 8eef133..4442f97 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll @@ -77,7 +77,7 @@ define i64 @con1024_minus_rem() { ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: negw a0, a0 +; CHECK-NEXT: neg a0, a0 ; CHECK-NEXT: andi a0, a0, 1024 ; CHECK-NEXT: ret %vscale = call i64 @llvm.vscale.i64() diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll index 0ea80bf..2e1784d 100644 --- a/llvm/test/CodeGen/RISCV/select.ll +++ b/llvm/test/CodeGen/RISCV/select.ll @@ -647,7 +647,7 @@ define i32 @select_add_1(i1 zeroext %cond, i32 %a, i32 %b) { ; ; RV64IM-LABEL: select_add_1: ; RV64IM: # %bb.0: # %entry -; RV64IM-NEXT: negw a0, a0 +; RV64IM-NEXT: neg a0, a0 ; RV64IM-NEXT: and a0, a0, a1 ; RV64IM-NEXT: addw a0, a2, a0 ; RV64IM-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll index b128abb..b155fea 100644 --- a/llvm/test/CodeGen/RISCV/sextw-removal.ll +++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll @@ -1048,21 +1048,21 @@ define signext i32 @bug(i32 signext %x) { ; CHECK-NEXT: srliw a2, a0, 24 ; CHECK-NEXT: seqz a2, a2 ; CHECK-NEXT: slli a3, a2, 3 -; CHECK-NEXT: negw a2, a2 +; CHECK-NEXT: neg a2, a2 ; CHECK-NEXT: sllw a0, a0, a3 ; CHECK-NEXT: andi a2, a2, -8 ; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: srliw a2, a0, 28 ; CHECK-NEXT: seqz a2, a2 ; CHECK-NEXT: slli a3, a2, 2 -; CHECK-NEXT: negw a2, a2 +; CHECK-NEXT: neg a2, a2 ; CHECK-NEXT: sllw a0, a0, a3 ; CHECK-NEXT: andi a2, a2, -4 ; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: srliw a2, a0, 30 ; CHECK-NEXT: seqz a2, a2 ; CHECK-NEXT: slli a3, a2, 1 -; CHECK-NEXT: negw a2, a2 +; CHECK-NEXT: neg a2, a2 ; CHECK-NEXT: sllw a0, a0, a3 ; CHECK-NEXT: andi a2, a2, -2 ; CHECK-NEXT: add a1, a1, a2 @@ -1090,21 +1090,21 @@ define signext i32 @bug(i32 signext %x) { ; NOREMOVAL-NEXT: srliw a2, a0, 24 ; NOREMOVAL-NEXT: seqz a2, a2 ; NOREMOVAL-NEXT: slli 
a3, a2, 3 -; NOREMOVAL-NEXT: negw a2, a2 +; NOREMOVAL-NEXT: neg a2, a2 ; NOREMOVAL-NEXT: sllw a0, a0, a3 ; NOREMOVAL-NEXT: andi a2, a2, -8 ; NOREMOVAL-NEXT: add a1, a1, a2 ; NOREMOVAL-NEXT: srliw a2, a0, 28 ; NOREMOVAL-NEXT: seqz a2, a2 ; NOREMOVAL-NEXT: slli a3, a2, 2 -; NOREMOVAL-NEXT: negw a2, a2 +; NOREMOVAL-NEXT: neg a2, a2 ; NOREMOVAL-NEXT: sllw a0, a0, a3 ; NOREMOVAL-NEXT: andi a2, a2, -4 ; NOREMOVAL-NEXT: add a1, a1, a2 ; NOREMOVAL-NEXT: srliw a2, a0, 30 ; NOREMOVAL-NEXT: seqz a2, a2 ; NOREMOVAL-NEXT: slli a3, a2, 1 -; NOREMOVAL-NEXT: negw a2, a2 +; NOREMOVAL-NEXT: neg a2, a2 ; NOREMOVAL-NEXT: sllw a0, a0, a3 ; NOREMOVAL-NEXT: andi a2, a2, -2 ; NOREMOVAL-NEXT: add a1, a1, a2 diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll index 7ca1ee1..1ca23d7 100644 --- a/llvm/test/CodeGen/RISCV/shifts.ll +++ b/llvm/test/CodeGen/RISCV/shifts.ll @@ -383,7 +383,7 @@ define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind { ; RV64I-LABEL: fshr64_minsize: ; RV64I: # %bb.0: ; RV64I-NEXT: srl a2, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: sll a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/shl-cttz.ll b/llvm/test/CodeGen/RISCV/shl-cttz.ll index 99dc4f8..e44d247 100644 --- a/llvm/test/CodeGen/RISCV/shl-cttz.ll +++ b/llvm/test/CodeGen/RISCV/shl-cttz.ll @@ -40,7 +40,7 @@ define i8 @shl_cttz_i8(i8 %x, i8 %y) { ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: srli a2, a1, 1 ; RV64I-NEXT: andi a2, a2, 85 -; RV64I-NEXT: subw a1, a1, a2 +; RV64I-NEXT: sub a1, a1, a2 ; RV64I-NEXT: andi a2, a1, 51 ; RV64I-NEXT: srli a1, a1, 2 ; RV64I-NEXT: andi a1, a1, 51 @@ -96,7 +96,7 @@ define i8 @shl_cttz_constant_i8(i8 %y) { ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: andi a1, a1, 85 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: andi a1, a0, 51 ; RV64I-NEXT: srli a0, a0, 2 ; RV64I-NEXT: andi a0, a0, 51 @@ -276,7 +276,7 @@ define i32 @shl_cttz_i32(i32 %x, i32 %y) { ; ; RV64I-LABEL: shl_cttz_i32: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 30667 ; RV64I-NEXT: addi a2, a2, 1329 @@ -333,7 +333,7 @@ define i32 @shl_cttz_i32_zero_is_defined(i32 %x, i32 %y) { ; RV64I-NEXT: sext.w a2, a1 ; RV64I-NEXT: beqz a2, .LBB5_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 30667 ; RV64I-NEXT: addi a2, a2, 1329 @@ -378,7 +378,7 @@ define i32 @shl_cttz_constant_i32(i32 %y) { ; ; RV64I-LABEL: shl_cttz_constant_i32: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: lui a1, 30667 ; RV64I-NEXT: addi a1, a1, 1329 @@ -474,7 +474,7 @@ define i32 @shl_cttz_multiuse_i32(i32 %x, i32 %y) { ; RV64I-NEXT: .cfi_offset ra, -8 ; RV64I-NEXT: .cfi_offset s0, -16 ; RV64I-NEXT: .cfi_offset s1, -24 -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 30667 ; RV64I-NEXT: addi a2, a2, 1329 diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll index 93fb230..bc23388 100644 --- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll @@ -50,7 +50,7 @@ define i1 @test_srem_odd(i29 %X) nounwind { ; RV64-NEXT: add a2, a2, a4 ; RV64-NEXT: slli a4, a0, 2 ; RV64-NEXT: add a4, a0, a4 -; 
RV64-NEXT: subw a1, a1, a4 +; RV64-NEXT: sub a1, a1, a4 ; RV64-NEXT: slli a4, a0, 17 ; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: slli a0, a0, 23 @@ -59,8 +59,8 @@ define i1 @test_srem_odd(i29 %X) nounwind { ; RV64-NEXT: add a1, a1, a3 ; RV64-NEXT: lui a3, 1324 ; RV64-NEXT: addi a2, a2, -83 -; RV64-NEXT: subw a0, a0, a2 -; RV64-NEXT: subw a1, a1, a0 +; RV64-NEXT: sub a0, a0, a2 +; RV64-NEXT: sub a1, a1, a0 ; RV64-NEXT: slli a1, a1, 35 ; RV64-NEXT: srli a1, a1, 35 ; RV64-NEXT: addi a0, a3, -165 @@ -189,7 +189,7 @@ define i1 @test_srem_even(i4 %X) nounwind { ; RV64M-NEXT: add a1, a1, a2 ; RV64M-NEXT: slli a2, a1, 3 ; RV64M-NEXT: slli a1, a1, 1 -; RV64M-NEXT: subw a1, a1, a2 +; RV64M-NEXT: sub a1, a1, a2 ; RV64M-NEXT: add a0, a0, a1 ; RV64M-NEXT: andi a0, a0, 15 ; RV64M-NEXT: addi a0, a0, -1 @@ -225,7 +225,7 @@ define i1 @test_srem_even(i4 %X) nounwind { ; RV64MV-NEXT: add a1, a1, a2 ; RV64MV-NEXT: slli a2, a1, 3 ; RV64MV-NEXT: slli a1, a1, 1 -; RV64MV-NEXT: subw a1, a1, a2 +; RV64MV-NEXT: sub a1, a1, a2 ; RV64MV-NEXT: add a0, a0, a1 ; RV64MV-NEXT: andi a0, a0, 15 ; RV64MV-NEXT: addi a0, a0, -1 @@ -256,7 +256,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; RV64-NEXT: srli a1, a1, 62 ; RV64-NEXT: add a1, a0, a1 ; RV64-NEXT: andi a1, a1, 60 -; RV64-NEXT: subw a0, a0, a1 +; RV64-NEXT: sub a0, a0, a1 ; RV64-NEXT: andi a0, a0, 63 ; RV64-NEXT: snez a0, a0 ; RV64-NEXT: ret @@ -280,7 +280,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; RV64M-NEXT: srli a1, a1, 62 ; RV64M-NEXT: add a1, a0, a1 ; RV64M-NEXT: andi a1, a1, 60 -; RV64M-NEXT: subw a0, a0, a1 +; RV64M-NEXT: sub a0, a0, a1 ; RV64M-NEXT: andi a0, a0, 63 ; RV64M-NEXT: snez a0, a0 ; RV64M-NEXT: ret @@ -304,7 +304,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; RV64MV-NEXT: srli a1, a1, 62 ; RV64MV-NEXT: add a1, a0, a1 ; RV64MV-NEXT: andi a1, a1, 60 -; RV64MV-NEXT: subw a0, a0, a1 +; RV64MV-NEXT: sub a0, a0, a1 ; RV64MV-NEXT: andi a0, a0, 63 ; RV64MV-NEXT: snez a0, a0 ; RV64MV-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll index 30ffaf6..5129ccc 100644 --- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -183,10 +183,10 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind { ; RV64IM-NEXT: mul a5, a5, t1 ; RV64IM-NEXT: li t1, -124 ; RV64IM-NEXT: mul a6, a6, t1 -; RV64IM-NEXT: subw a4, a4, a7 -; RV64IM-NEXT: subw a1, a1, t0 -; RV64IM-NEXT: subw a3, a3, a5 -; RV64IM-NEXT: subw a2, a2, a6 +; RV64IM-NEXT: sub a4, a4, a7 +; RV64IM-NEXT: sub a1, a1, t0 +; RV64IM-NEXT: sub a3, a3, a5 +; RV64IM-NEXT: sub a2, a2, a6 ; RV64IM-NEXT: sh a3, 0(a0) ; RV64IM-NEXT: sh a2, 2(a0) ; RV64IM-NEXT: sh a4, 4(a0) @@ -357,10 +357,10 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind { ; RV64IM-NEXT: mul a7, a7, t1 ; RV64IM-NEXT: mul t0, t0, t1 ; RV64IM-NEXT: mul a2, a2, t1 -; RV64IM-NEXT: subw a3, a3, a6 -; RV64IM-NEXT: subw a4, a4, a7 -; RV64IM-NEXT: subw a5, a5, t0 -; RV64IM-NEXT: subw a1, a1, a2 +; RV64IM-NEXT: sub a3, a3, a6 +; RV64IM-NEXT: sub a4, a4, a7 +; RV64IM-NEXT: sub a5, a5, t0 +; RV64IM-NEXT: sub a1, a1, a2 ; RV64IM-NEXT: sh a3, 0(a0) ; RV64IM-NEXT: sh a4, 2(a0) ; RV64IM-NEXT: sh a5, 4(a0) @@ -597,10 +597,10 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; RV64IM-NEXT: add a1, a1, t1 ; RV64IM-NEXT: add a3, a3, t0 ; RV64IM-NEXT: add a4, a4, a7 -; RV64IM-NEXT: subw a2, a2, a6 -; RV64IM-NEXT: subw a1, a1, t4 -; RV64IM-NEXT: subw a3, a3, t3 -; RV64IM-NEXT: subw a4, a4, t2 +; RV64IM-NEXT: sub a2, 
a2, a6 +; RV64IM-NEXT: sub a1, a1, t4 +; RV64IM-NEXT: sub a3, a3, t3 +; RV64IM-NEXT: sub a4, a4, t2 ; RV64IM-NEXT: sh a2, 0(a0) ; RV64IM-NEXT: sh a1, 2(a0) ; RV64IM-NEXT: sh a3, 4(a0) @@ -703,15 +703,15 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind { ; RV64I-NEXT: srli a1, a2, 58 ; RV64I-NEXT: add a1, a2, a1 ; RV64I-NEXT: andi a1, a1, -64 -; RV64I-NEXT: subw s1, a2, a1 +; RV64I-NEXT: sub s1, a2, a1 ; RV64I-NEXT: srli a1, a3, 59 ; RV64I-NEXT: add a1, a3, a1 ; RV64I-NEXT: andi a1, a1, -32 -; RV64I-NEXT: subw s2, a3, a1 +; RV64I-NEXT: sub s2, a3, a1 ; RV64I-NEXT: srli a1, a4, 61 ; RV64I-NEXT: add a1, a4, a1 ; RV64I-NEXT: andi a1, a1, -8 -; RV64I-NEXT: subw s3, a4, a1 +; RV64I-NEXT: sub s3, a4, a1 ; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: call __moddi3 ; RV64I-NEXT: sh s1, 0(s0) @@ -737,23 +737,23 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind { ; RV64IM-NEXT: srli a6, a2, 58 ; RV64IM-NEXT: add a6, a2, a6 ; RV64IM-NEXT: andi a6, a6, -64 -; RV64IM-NEXT: subw a2, a2, a6 +; RV64IM-NEXT: sub a2, a2, a6 ; RV64IM-NEXT: srli a6, a3, 59 ; RV64IM-NEXT: add a6, a3, a6 ; RV64IM-NEXT: andi a6, a6, -32 -; RV64IM-NEXT: subw a3, a3, a6 +; RV64IM-NEXT: sub a3, a3, a6 ; RV64IM-NEXT: srli a6, a4, 61 ; RV64IM-NEXT: mulh a5, a1, a5 ; RV64IM-NEXT: add a6, a4, a6 ; RV64IM-NEXT: add a5, a5, a1 ; RV64IM-NEXT: andi a6, a6, -8 -; RV64IM-NEXT: subw a4, a4, a6 +; RV64IM-NEXT: sub a4, a4, a6 ; RV64IM-NEXT: srli a6, a5, 63 ; RV64IM-NEXT: srli a5, a5, 6 ; RV64IM-NEXT: add a5, a5, a6 ; RV64IM-NEXT: li a6, 95 ; RV64IM-NEXT: mul a5, a5, a6 -; RV64IM-NEXT: subw a1, a1, a5 +; RV64IM-NEXT: sub a1, a1, a5 ; RV64IM-NEXT: sh a2, 0(a0) ; RV64IM-NEXT: sh a3, 2(a0) ; RV64IM-NEXT: sh a4, 4(a0) @@ -909,9 +909,9 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind { ; RV64IM-NEXT: mul a6, a6, a7 ; RV64IM-NEXT: li a7, 23 ; RV64IM-NEXT: mul a4, a4, a7 -; RV64IM-NEXT: subw a2, a2, a5 -; RV64IM-NEXT: subw a1, a1, a6 -; RV64IM-NEXT: subw a3, a3, a4 +; RV64IM-NEXT: sub a2, a2, a5 +; RV64IM-NEXT: sub a1, a1, a6 +; RV64IM-NEXT: sub a3, a3, a4 ; RV64IM-NEXT: sh zero, 0(a0) ; RV64IM-NEXT: sh a2, 2(a0) ; RV64IM-NEXT: sh a3, 4(a0) @@ -1011,7 +1011,7 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind { ; RV64I-NEXT: add a1, a2, a1 ; RV64I-NEXT: lui a3, 8 ; RV64I-NEXT: and a1, a1, a3 -; RV64I-NEXT: subw s3, a2, a1 +; RV64I-NEXT: sub s3, a2, a1 ; RV64I-NEXT: li a1, 23 ; RV64I-NEXT: call __moddi3 ; RV64I-NEXT: mv s2, a0 @@ -1050,7 +1050,7 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind { ; RV64IM-NEXT: add a5, a5, a7 ; RV64IM-NEXT: mulh a4, a3, a4 ; RV64IM-NEXT: add a4, a4, a3 -; RV64IM-NEXT: subw a2, a2, a6 +; RV64IM-NEXT: sub a2, a2, a6 ; RV64IM-NEXT: srli a6, a4, 63 ; RV64IM-NEXT: srli a4, a4, 4 ; RV64IM-NEXT: add a4, a4, a6 @@ -1059,8 +1059,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind { ; RV64IM-NEXT: mul a5, a5, a6 ; RV64IM-NEXT: li a6, 23 ; RV64IM-NEXT: mul a4, a4, a6 -; RV64IM-NEXT: subw a1, a1, a5 -; RV64IM-NEXT: subw a3, a3, a4 +; RV64IM-NEXT: sub a1, a1, a5 +; RV64IM-NEXT: sub a3, a3, a4 ; RV64IM-NEXT: sh zero, 0(a0) ; RV64IM-NEXT: sh a2, 2(a0) ; RV64IM-NEXT: sh a3, 4(a0) diff --git a/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll b/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll index 3007c35..0c13a1d 100644 --- a/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll +++ b/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll @@ -26,7 +26,7 @@ define zeroext i16 @overflow_add(i16 zeroext %a, i16 zeroext %b) { define zeroext i16 
@overflow_sub(i16 zeroext %a, i16 zeroext %b) { ; CHECK-LABEL: overflow_sub: ; CHECK: # %bb.0: -; CHECK-NEXT: subw a0, a0, a1 +; CHECK-NEXT: sub a0, a0, a1 ; CHECK-NEXT: ori a0, a0, 1 ; CHECK-NEXT: slli a0, a0, 48 ; CHECK-NEXT: srli a0, a0, 48 diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll index af5121d..ee49612 100644 --- a/llvm/test/CodeGen/RISCV/urem-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll @@ -48,7 +48,7 @@ define i32 @fold_urem_positive_odd(i32 %x) nounwind { ; RV64IM-NEXT: slli a2, a2, 32 ; RV64IM-NEXT: mulhu a1, a1, a2 ; RV64IM-NEXT: srli a1, a1, 32 -; RV64IM-NEXT: subw a2, a0, a1 +; RV64IM-NEXT: sub a2, a0, a1 ; RV64IM-NEXT: srliw a2, a2, 1 ; RV64IM-NEXT: add a1, a2, a1 ; RV64IM-NEXT: srli a1, a1, 6 @@ -174,7 +174,7 @@ define i32 @combine_urem_udiv(i32 %x) nounwind { ; RV64IM-NEXT: slli a2, a2, 32 ; RV64IM-NEXT: mulhu a1, a1, a2 ; RV64IM-NEXT: srli a1, a1, 32 -; RV64IM-NEXT: subw a2, a0, a1 +; RV64IM-NEXT: sub a2, a0, a1 ; RV64IM-NEXT: srliw a2, a2, 1 ; RV64IM-NEXT: add a1, a2, a1 ; RV64IM-NEXT: li a2, 95 diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll index d33c666..636fdfa 100644 --- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll @@ -31,11 +31,11 @@ define i1 @test_urem_odd(i13 %X) nounwind { ; RV64-NEXT: slli a1, a0, 4 ; RV64-NEXT: slli a2, a0, 6 ; RV64-NEXT: slli a3, a0, 8 -; RV64-NEXT: subw a1, a1, a2 +; RV64-NEXT: sub a1, a1, a2 ; RV64-NEXT: slli a2, a0, 10 -; RV64-NEXT: subw a3, a3, a2 +; RV64-NEXT: sub a3, a3, a2 ; RV64-NEXT: slli a2, a0, 2 -; RV64-NEXT: subw a2, a0, a2 +; RV64-NEXT: sub a2, a0, a2 ; RV64-NEXT: slli a0, a0, 12 ; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a0, a3, a0 @@ -138,10 +138,10 @@ define i1 @test_urem_even(i27 %X) nounwind { ; RV64-NEXT: slli a4, a0, 18 ; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: slli a0, a0, 27 -; RV64-NEXT: subw a0, a0, a2 +; RV64-NEXT: sub a0, a0, a2 ; RV64-NEXT: lui a2, 2341 ; RV64-NEXT: add a1, a1, a3 -; RV64-NEXT: subw a0, a0, a1 +; RV64-NEXT: sub a0, a0, a1 ; RV64-NEXT: slli a1, a0, 26 ; RV64-NEXT: slli a0, a0, 37 ; RV64-NEXT: srli a0, a0, 38 @@ -234,8 +234,8 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; RV64-LABEL: test_urem_odd_setne: ; RV64: # %bb.0: ; RV64-NEXT: slli a1, a0, 1 -; RV64-NEXT: negw a0, a0 -; RV64-NEXT: subw a0, a0, a1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: sub a0, a0, a1 ; RV64-NEXT: andi a0, a0, 15 ; RV64-NEXT: sltiu a0, a0, 4 ; RV64-NEXT: xori a0, a0, 1 @@ -254,8 +254,8 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; RV64M-LABEL: test_urem_odd_setne: ; RV64M: # %bb.0: ; RV64M-NEXT: slli a1, a0, 1 -; RV64M-NEXT: negw a0, a0 -; RV64M-NEXT: subw a0, a0, a1 +; RV64M-NEXT: neg a0, a0 +; RV64M-NEXT: sub a0, a0, a1 ; RV64M-NEXT: andi a0, a0, 15 ; RV64M-NEXT: sltiu a0, a0, 4 ; RV64M-NEXT: xori a0, a0, 1 @@ -274,8 +274,8 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; RV64MV-LABEL: test_urem_odd_setne: ; RV64MV: # %bb.0: ; RV64MV-NEXT: slli a1, a0, 1 -; RV64MV-NEXT: negw a0, a0 -; RV64MV-NEXT: subw a0, a0, a1 +; RV64MV-NEXT: neg a0, a0 +; RV64MV-NEXT: sub a0, a0, a1 ; RV64MV-NEXT: andi a0, a0, 15 ; RV64MV-NEXT: sltiu a0, a0, 4 ; RV64MV-NEXT: xori a0, a0, 1 @@ -306,9 +306,9 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind { ; RV64-NEXT: slli a1, a0, 2 ; RV64-NEXT: slli a2, a0, 4 ; RV64-NEXT: slli a3, a0, 6 -; RV64-NEXT: subw a1, a1, a0 -; RV64-NEXT: subw a2, a2, a3 -; RV64-NEXT: subw a1, a1, a2 +; RV64-NEXT: sub a1, 
a1, a0 +; RV64-NEXT: sub a2, a2, a3 +; RV64-NEXT: sub a1, a1, a2 ; RV64-NEXT: slli a0, a0, 8 ; RV64-NEXT: add a0, a1, a0 ; RV64-NEXT: andi a0, a0, 511 @@ -437,7 +437,7 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV64-NEXT: addi a2, a2, -2 ; RV64-NEXT: add a1, a1, a4 ; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: subw a4, t0, a7 +; RV64-NEXT: sub a4, t0, a7 ; RV64-NEXT: slli a6, a3, 3 ; RV64-NEXT: slli a7, a3, 6 ; RV64-NEXT: slli t0, a3, 9 @@ -447,18 +447,18 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV64-NEXT: slli a6, a2, 4 ; RV64-NEXT: add a7, a7, t0 ; RV64-NEXT: slli t0, a2, 6 -; RV64-NEXT: subw a6, a6, t0 +; RV64-NEXT: sub a6, a6, t0 ; RV64-NEXT: slli t0, a2, 8 -; RV64-NEXT: subw a5, a5, a2 +; RV64-NEXT: sub a5, a5, a2 ; RV64-NEXT: slli a2, a2, 10 -; RV64-NEXT: subw a2, t0, a2 -; RV64-NEXT: subw a4, a4, a1 +; RV64-NEXT: sub a2, t0, a2 +; RV64-NEXT: sub a4, a4, a1 ; RV64-NEXT: add a3, a3, a7 -; RV64-NEXT: subw a1, a5, a6 +; RV64-NEXT: sub a1, a5, a6 ; RV64-NEXT: slli a5, a4, 10 ; RV64-NEXT: slli a4, a4, 53 -; RV64-NEXT: negw a3, a3 -; RV64-NEXT: subw a1, a1, a2 +; RV64-NEXT: neg a3, a3 +; RV64-NEXT: sub a1, a1, a2 ; RV64-NEXT: srli a4, a4, 54 ; RV64-NEXT: andi a2, a3, 2047 ; RV64-NEXT: andi a1, a1, 2047 diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll index 3ef9f3f..5a3dfd1 100644 --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -157,10 +157,10 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind { ; RV64IM-NEXT: mul a7, a7, t1 ; RV64IM-NEXT: slli t1, a5, 7 ; RV64IM-NEXT: slli a5, a5, 2 -; RV64IM-NEXT: subw a5, a5, t1 -; RV64IM-NEXT: subw a2, a2, a6 -; RV64IM-NEXT: subw a4, a4, t0 -; RV64IM-NEXT: subw a1, a1, a7 +; RV64IM-NEXT: sub a5, a5, t1 +; RV64IM-NEXT: sub a2, a2, a6 +; RV64IM-NEXT: sub a4, a4, t0 +; RV64IM-NEXT: sub a1, a1, a7 ; RV64IM-NEXT: add a3, a3, a5 ; RV64IM-NEXT: sh a2, 0(a0) ; RV64IM-NEXT: sh a3, 2(a0) @@ -300,10 +300,10 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind { ; RV64IM-NEXT: mul t0, t0, a6 ; RV64IM-NEXT: mul t1, t1, a6 ; RV64IM-NEXT: mul a2, a2, a6 -; RV64IM-NEXT: subw a3, a3, a7 -; RV64IM-NEXT: subw a4, a4, t0 -; RV64IM-NEXT: subw a5, a5, t1 -; RV64IM-NEXT: subw a1, a1, a2 +; RV64IM-NEXT: sub a3, a3, a7 +; RV64IM-NEXT: sub a4, a4, t0 +; RV64IM-NEXT: sub a5, a5, t1 +; RV64IM-NEXT: sub a1, a1, a2 ; RV64IM-NEXT: sh a3, 0(a0) ; RV64IM-NEXT: sh a4, 2(a0) ; RV64IM-NEXT: sh a5, 4(a0) @@ -508,10 +508,10 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind { ; RV64IM-NEXT: add a1, a1, t1 ; RV64IM-NEXT: add a3, a3, t0 ; RV64IM-NEXT: add a4, a4, a7 -; RV64IM-NEXT: subw a2, a2, a6 -; RV64IM-NEXT: subw a1, a1, t4 -; RV64IM-NEXT: subw a3, a3, t3 -; RV64IM-NEXT: subw a4, a4, t2 +; RV64IM-NEXT: sub a2, a2, a6 +; RV64IM-NEXT: sub a1, a1, t4 +; RV64IM-NEXT: sub a3, a3, t3 +; RV64IM-NEXT: sub a4, a4, t2 ; RV64IM-NEXT: sh a2, 0(a0) ; RV64IM-NEXT: sh a1, 2(a0) ; RV64IM-NEXT: sh a3, 4(a0) @@ -622,7 +622,7 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind { ; RV64IM-NEXT: andi a4, a4, 7 ; RV64IM-NEXT: mulhu a5, a1, a5 ; RV64IM-NEXT: mul a5, a5, a6 -; RV64IM-NEXT: subw a1, a1, a5 +; RV64IM-NEXT: sub a1, a1, a5 ; RV64IM-NEXT: sh a2, 0(a0) ; RV64IM-NEXT: sh a3, 2(a0) ; RV64IM-NEXT: sh a4, 4(a0) @@ -757,9 +757,9 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind { ; RV64IM-NEXT: addi a7, a7, 1327 ; RV64IM-NEXT: mulhu a5, a1, a5 ; RV64IM-NEXT: mul a5, a5, a7 -; RV64IM-NEXT: subw a2, a2, a4 -; RV64IM-NEXT: subw a3, 
a3, a6 -; RV64IM-NEXT: subw a1, a1, a5 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: sub a3, a3, a6 +; RV64IM-NEXT: sub a1, a1, a5 ; RV64IM-NEXT: sh zero, 0(a0) ; RV64IM-NEXT: sh a2, 2(a0) ; RV64IM-NEXT: sh a3, 4(a0) diff --git a/llvm/test/CodeGen/RISCV/xqciac.ll b/llvm/test/CodeGen/RISCV/xqciac.ll index a3b4e78..4c77b39 100644 --- a/llvm/test/CodeGen/RISCV/xqciac.ll +++ b/llvm/test/CodeGen/RISCV/xqciac.ll @@ -231,12 +231,12 @@ define dso_local i32 @pow2(i32 %a, i32 %b) local_unnamed_addr #0 { ; ; RV32IMXQCIAC-LABEL: pow2: ; RV32IMXQCIAC: # %bb.0: # %entry -; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 5 +; RV32IMXQCIAC-NEXT: qc.shladd a0, a1, a0, 5 ; RV32IMXQCIAC-NEXT: ret ; ; RV32IZBAMXQCIAC-LABEL: pow2: ; RV32IZBAMXQCIAC: # %bb.0: # %entry -; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a1, 5 +; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a1, a0, 5 ; RV32IZBAMXQCIAC-NEXT: ret entry: %mul = mul nsw i32 %b, 32 @@ -276,12 +276,12 @@ define dso_local i32 @shladd(i32 %a, i32 %b) local_unnamed_addr #0 { ; ; RV32IMXQCIAC-LABEL: shladd: ; RV32IMXQCIAC: # %bb.0: # %entry -; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 31 +; RV32IMXQCIAC-NEXT: qc.shladd a0, a1, a0, 31 ; RV32IMXQCIAC-NEXT: ret ; ; RV32IZBAMXQCIAC-LABEL: shladd: ; RV32IZBAMXQCIAC: # %bb.0: # %entry -; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a1, 31 +; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a1, a0, 31 ; RV32IZBAMXQCIAC-NEXT: ret entry: %shl = shl nsw i32 %b, 31 @@ -305,9 +305,9 @@ define dso_local i64 @shladd64(i64 %a, i64 %b) local_unnamed_addr #0 { ; RV32IMXQCIAC-LABEL: shladd64: ; RV32IMXQCIAC: # %bb.0: # %entry ; RV32IMXQCIAC-NEXT: srli a4, a2, 1 -; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a2, 31 +; RV32IMXQCIAC-NEXT: qc.shladd a0, a2, a0, 31 ; RV32IMXQCIAC-NEXT: slli a2, a2, 31 -; RV32IMXQCIAC-NEXT: qc.shladd a3, a4, a3, 31 +; RV32IMXQCIAC-NEXT: qc.shladd a3, a3, a4, 31 ; RV32IMXQCIAC-NEXT: sltu a2, a0, a2 ; RV32IMXQCIAC-NEXT: add a1, a1, a3 ; RV32IMXQCIAC-NEXT: add a1, a1, a2 @@ -316,9 +316,9 @@ define dso_local i64 @shladd64(i64 %a, i64 %b) local_unnamed_addr #0 { ; RV32IZBAMXQCIAC-LABEL: shladd64: ; RV32IZBAMXQCIAC: # %bb.0: # %entry ; RV32IZBAMXQCIAC-NEXT: srli a4, a2, 1 -; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a2, 31 +; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a2, a0, 31 ; RV32IZBAMXQCIAC-NEXT: slli a2, a2, 31 -; RV32IZBAMXQCIAC-NEXT: qc.shladd a3, a4, a3, 31 +; RV32IZBAMXQCIAC-NEXT: qc.shladd a3, a3, a4, 31 ; RV32IZBAMXQCIAC-NEXT: sltu a2, a0, a2 ; RV32IZBAMXQCIAC-NEXT: add a1, a1, a3 ; RV32IZBAMXQCIAC-NEXT: add a1, a1, a2 @@ -338,12 +338,12 @@ define dso_local i32 @shladd_ordisjoint(i32 %a, i32 %b) local_unnamed_addr #0 { ; ; RV32IMXQCIAC-LABEL: shladd_ordisjoint: ; RV32IMXQCIAC: # %bb.0: # %entry -; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 22 +; RV32IMXQCIAC-NEXT: qc.shladd a0, a1, a0, 22 ; RV32IMXQCIAC-NEXT: ret ; ; RV32IZBAMXQCIAC-LABEL: shladd_ordisjoint: ; RV32IZBAMXQCIAC: # %bb.0: # %entry -; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a1, 22 +; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a1, a0, 22 ; RV32IZBAMXQCIAC-NEXT: ret entry: %shl = shl nsw i32 %b, 22 @@ -361,13 +361,13 @@ define dso_local i32 @shladdc1c2(i32 %a, i32 %b) local_unnamed_addr #0 { ; ; RV32IMXQCIAC-LABEL: shladdc1c2: ; RV32IMXQCIAC: # %bb.0: # %entry -; RV32IMXQCIAC-NEXT: qc.shladd a0, a1, a0, 5 +; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 5 ; RV32IMXQCIAC-NEXT: slli a0, a0, 26 ; RV32IMXQCIAC-NEXT: ret ; ; RV32IZBAMXQCIAC-LABEL: shladdc1c2: ; RV32IZBAMXQCIAC: # %bb.0: # %entry -; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a1, a0, 5 +; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a1, 5 ; 
RV32IZBAMXQCIAC-NEXT: slli a0, a0, 26 ; RV32IZBAMXQCIAC-NEXT: ret entry: @@ -388,7 +388,7 @@ define dso_local i32 @shxaddc1c2(i32 %a, i32 %b) local_unnamed_addr #0 { ; RV32IMXQCIAC-LABEL: shxaddc1c2: ; RV32IMXQCIAC: # %bb.0: # %entry ; RV32IMXQCIAC-NEXT: slli a1, a1, 28 -; RV32IMXQCIAC-NEXT: qc.shladd a0, a1, a0, 31 +; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 31 ; RV32IMXQCIAC-NEXT: ret ; ; RV32IZBAMXQCIAC-LABEL: shxaddc1c2: @@ -417,18 +417,18 @@ define dso_local i64 @shladdc1c264(i64 %a, i64 %b) local_unnamed_addr #0 { ; RV32IMXQCIAC-LABEL: shladdc1c264: ; RV32IMXQCIAC: # %bb.0: # %entry ; RV32IMXQCIAC-NEXT: srli a1, a2, 12 -; RV32IMXQCIAC-NEXT: qc.shladd a1, a1, a3, 20 +; RV32IMXQCIAC-NEXT: qc.shladd a1, a3, a1, 20 ; RV32IMXQCIAC-NEXT: slli a2, a2, 20 -; RV32IMXQCIAC-NEXT: qc.shladd a1, a1, a0, 23 +; RV32IMXQCIAC-NEXT: qc.shladd a1, a0, a1, 23 ; RV32IMXQCIAC-NEXT: mv a0, a2 ; RV32IMXQCIAC-NEXT: ret ; ; RV32IZBAMXQCIAC-LABEL: shladdc1c264: ; RV32IZBAMXQCIAC: # %bb.0: # %entry ; RV32IZBAMXQCIAC-NEXT: srli a1, a2, 12 -; RV32IZBAMXQCIAC-NEXT: qc.shladd a1, a1, a3, 20 +; RV32IZBAMXQCIAC-NEXT: qc.shladd a1, a3, a1, 20 ; RV32IZBAMXQCIAC-NEXT: slli a2, a2, 20 -; RV32IZBAMXQCIAC-NEXT: qc.shladd a1, a1, a0, 23 +; RV32IZBAMXQCIAC-NEXT: qc.shladd a1, a0, a1, 23 ; RV32IZBAMXQCIAC-NEXT: mv a0, a2 ; RV32IZBAMXQCIAC-NEXT: ret entry: @@ -449,13 +449,13 @@ define dso_local i32 @shladdc1equalc2(i32 %a, i32 %b) local_unnamed_addr #0 { ; RV32IMXQCIAC-LABEL: shladdc1equalc2: ; RV32IMXQCIAC: # %bb.0: # %entry ; RV32IMXQCIAC-NEXT: slli a1, a1, 12 -; RV32IMXQCIAC-NEXT: qc.shladd a0, a1, a0, 12 +; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 12 ; RV32IMXQCIAC-NEXT: ret ; ; RV32IZBAMXQCIAC-LABEL: shladdc1equalc2: ; RV32IZBAMXQCIAC: # %bb.0: # %entry ; RV32IZBAMXQCIAC-NEXT: slli a1, a1, 12 -; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a1, a0, 12 +; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a1, 12 ; RV32IZBAMXQCIAC-NEXT: ret entry: %shlc1 = shl nsw i32 %a, 12 diff --git a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll index f9db686..1ef37f7 100644 --- a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll +++ b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll @@ -242,7 +242,7 @@ define void @foo7(ptr nocapture %p) nounwind { ; RV64ZDINX: # %bb.0: # %entry ; RV64ZDINX-NEXT: lui a1, %hi(d) ; RV64ZDINX-NEXT: addi a2, a1, %lo(d) -; RV64ZDINX-NEXT: lwu a2, 8(a2) +; RV64ZDINX-NEXT: lw a2, 8(a2) ; RV64ZDINX-NEXT: lwu a1, %lo(d+4)(a1) ; RV64ZDINX-NEXT: slli a2, a2, 32 ; RV64ZDINX-NEXT: or a1, a2, a1 @@ -337,7 +337,7 @@ define void @foo9(ptr nocapture %p) nounwind { ; RV64ZDINX: # %bb.0: # %entry ; RV64ZDINX-NEXT: lui a1, %hi(e) ; RV64ZDINX-NEXT: addi a2, a1, %lo(e) -; RV64ZDINX-NEXT: lwu a2, 4(a2) +; RV64ZDINX-NEXT: lw a2, 4(a2) ; RV64ZDINX-NEXT: lwu a1, %lo(e)(a1) ; RV64ZDINX-NEXT: slli a2, a2, 32 ; RV64ZDINX-NEXT: or a1, a2, a1 @@ -480,7 +480,7 @@ define double @foo13(ptr nocapture %p) nounwind { ; RV64ZDINX-LABEL: foo13: ; RV64ZDINX: # %bb.0: # %entry ; RV64ZDINX-NEXT: lui a0, %hi(f) -; RV64ZDINX-NEXT: lwu a1, %lo(f+8)(a0) +; RV64ZDINX-NEXT: lw a1, %lo(f+8)(a0) ; RV64ZDINX-NEXT: lwu a0, %lo(f+4)(a0) ; RV64ZDINX-NEXT: slli a1, a1, 32 ; RV64ZDINX-NEXT: or a0, a1, a0 diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fp-intrinsics.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fp-intrinsics.ll index 3d46b52..70030ca 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fp-intrinsics.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fp-intrinsics.ll @@ -1,4 +1,5 @@ ; RUN: llc 
-verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: %[[#extinst_id:]] = OpExtInstImport "OpenCL.std" @@ -337,3 +338,68 @@ entry: } declare float @llvm.fma.f32(float, float, float) + +; CHECK: OpFunction +; CHECK: %[[#d:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#fracPtr:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#integralPtr:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#varPtr:]] = OpVariable %[[#]] Function +; CHECK: %[[#frac:]] = OpExtInst %[[#var2]] %[[#extinst_id]] modf %[[#d]] %[[#varPtr]] +; CHECK: %[[#integral:]] = OpLoad %[[#var2]] %[[#varPtr]] +; CHECK: OpStore %[[#fracPtr]] %[[#frac]] +; CHECK: OpStore %[[#integralPtr]] %[[#integral]] +; CHECK: OpFunctionEnd +define void @TestModf(double %d, ptr addrspace(1) %frac, ptr addrspace(1) %integral) { +entry: + %4 = tail call { double, double } @llvm.modf.f64(double %d) + %5 = extractvalue { double, double } %4, 0 + %6 = extractvalue { double, double } %4, 1 + store double %5, ptr addrspace(1) %frac, align 8 + store double %6, ptr addrspace(1) %integral, align 8 + ret void +} + +; CHECK: OpFunction +; CHECK: %[[#d:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#fracPtr:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#integralPtr:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#entryBlock:]] = OpLabel +; CHECK: %[[#varPtr:]] = OpVariable %[[#]] Function +; CHECK: OpBranchConditional %[[#]] %[[#lor_lhs_falseBlock:]] %[[#if_thenBlock:]] +; CHECK: %[[#lor_lhs_falseBlock]] = OpLabel +; CHECK: OpBranchConditional %[[#]] %[[#if_endBlock:]] %[[#if_thenBlock]] +; CHECK: %[[#if_thenBlock]] = OpLabel +; CHECK: OpBranch %[[#returnBlock:]] +; CHECK: %[[#if_endBlock]] = OpLabel +; CHECK: %[[#frac:]] = OpExtInst %[[#var2]] %[[#extinst_id]] modf %[[#d]] %[[#varPtr]] +; CHECK: %[[#integral:]] = OpLoad %[[#var2]] %[[#varPtr]] +; CHECK: OpStore %[[#fracPtr]] %[[#frac]] +; CHECK: OpStore %[[#integralPtr]] %[[#integral]] +; CHECK: OpFunctionEnd +define dso_local void @TestModf2(double noundef %d, ptr noundef %frac, ptr noundef %integral) { +entry: + %0 = load ptr, ptr %frac, align 8 + %tobool = icmp ne ptr %0, null + br i1 %tobool, label %lor.lhs.false, label %if.then + +lor.lhs.false: + %1 = load ptr, ptr %integral, align 8 + %tobool1 = icmp ne ptr %1, null + br i1 %tobool1, label %if.end, label %if.then + +if.then: + br label %return + +if.end: + %6 = tail call { double, double } @llvm.modf.f64(double %d) + %7 = extractvalue { double, double } %6, 0 + %8 = extractvalue { double, double } %6, 1 + store double %7, ptr %frac, align 4 + store double %8, ptr %integral, align 4 + br label %return + +return: + ret void +} + +declare { double, double } @llvm.modf.f64(double) diff --git a/llvm/test/CodeGen/Thumb2/ifcvt-rescan-bug-2016-08-22.ll b/llvm/test/CodeGen/Thumb2/ifcvt-rescan-bug-2016-08-22.ll index 9acdd7e..b70505c 100644 --- a/llvm/test/CodeGen/Thumb2/ifcvt-rescan-bug-2016-08-22.ll +++ b/llvm/test/CodeGen/Thumb2/ifcvt-rescan-bug-2016-08-22.ll @@ -17,6 +17,7 @@ declare void @_ZNSsC1EPKcRKSaIcE() unnamed_addr #0 ; CHECK: .LBB0_2 ; Function Attrs: nounwind define hidden void @_ZN4llvm14DOTGraphTraitsIPNS_13ScheduleDAGMIEE17getEdgeAttributesEPKNS_5SUnitENS_13SUnitIteratorEPKNS_11ScheduleDAGE() #0 align 2 { + %a = alloca i8 br i1 undef, label %1, label %2 ; <label>:1: ; preds = %0 @@ -25,7 +26,7 @@ define hidden void @_ZN4llvm14DOTGraphTraitsIPNS_13ScheduleDAGMIEE17getEdgeAttri br label %3 ; <label>:2: ; preds = %0 - call 
void @llvm.lifetime.start.p0(i64 1, ptr undef) #0 + call void @llvm.lifetime.start.p0(i64 1, ptr %a) #0 call void @_ZNSaIcEC2Ev() #0 br label %3 diff --git a/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll b/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll new file mode 100644 index 0000000..8030438 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll @@ -0,0 +1,151 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s + +target triple = "wasm32-unknown-unknown" + +declare i32 @memcmp(ptr, ptr, i32) + +define i1 @memcmp_expand_3(ptr %a, ptr %b) { +; CHECK-LABEL: memcmp_expand_3: +; CHECK: .functype memcmp_expand_3 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.load16_u $push7=, 0($0):p2align=0 +; CHECK-NEXT: i32.load16_u $push6=, 0($1):p2align=0 +; CHECK-NEXT: i32.xor $push8=, $pop7, $pop6 +; CHECK-NEXT: i32.const $push0=, 2 +; CHECK-NEXT: i32.add $push3=, $0, $pop0 +; CHECK-NEXT: i32.load8_u $push4=, 0($pop3) +; CHECK-NEXT: i32.const $push13=, 2 +; CHECK-NEXT: i32.add $push1=, $1, $pop13 +; CHECK-NEXT: i32.load8_u $push2=, 0($pop1) +; CHECK-NEXT: i32.xor $push5=, $pop4, $pop2 +; CHECK-NEXT: i32.or $push9=, $pop8, $pop5 +; CHECK-NEXT: i32.const $push10=, 65535 +; CHECK-NEXT: i32.and $push11=, $pop9, $pop10 +; CHECK-NEXT: i32.eqz $push12=, $pop11 +; CHECK-NEXT: return $pop12 + %cmp_3 = call i32 @memcmp(ptr %a, ptr %b, i32 3) + %res = icmp eq i32 %cmp_3, 0 + ret i1 %res +} + +define i1 @memcmp_expand_5(ptr %a, ptr %b) { +; CHECK-LABEL: memcmp_expand_5: +; CHECK: .functype memcmp_expand_5 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.load $push7=, 0($0):p2align=0 +; CHECK-NEXT: i32.load $push6=, 0($1):p2align=0 +; CHECK-NEXT: i32.xor $push8=, $pop7, $pop6 +; CHECK-NEXT: i32.const $push0=, 4 +; CHECK-NEXT: i32.add $push3=, $0, $pop0 +; CHECK-NEXT: i32.load8_u $push4=, 0($pop3) +; CHECK-NEXT: i32.const $push11=, 4 +; CHECK-NEXT: i32.add $push1=, $1, $pop11 +; CHECK-NEXT: i32.load8_u $push2=, 0($pop1) +; CHECK-NEXT: i32.xor $push5=, $pop4, $pop2 +; CHECK-NEXT: i32.or $push9=, $pop8, $pop5 +; CHECK-NEXT: i32.eqz $push10=, $pop9 +; CHECK-NEXT: return $pop10 + %cmp_5 = call i32 @memcmp(ptr %a, ptr %b, i32 5) + %res = icmp eq i32 %cmp_5, 0 + ret i1 %res +} + +define i1 @memcmp_expand_7(ptr %a, ptr %b) { +; CHECK-LABEL: memcmp_expand_7: +; CHECK: .functype memcmp_expand_7 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.load $push7=, 0($0):p2align=0 +; CHECK-NEXT: i32.load $push6=, 0($1):p2align=0 +; CHECK-NEXT: i32.xor $push8=, $pop7, $pop6 +; CHECK-NEXT: i32.const $push0=, 3 +; CHECK-NEXT: i32.add $push3=, $0, $pop0 +; CHECK-NEXT: i32.load $push4=, 0($pop3):p2align=0 +; CHECK-NEXT: i32.const $push11=, 3 +; CHECK-NEXT: i32.add $push1=, $1, $pop11 +; CHECK-NEXT: i32.load $push2=, 0($pop1):p2align=0 +; CHECK-NEXT: i32.xor $push5=, $pop4, $pop2 +; CHECK-NEXT: i32.or $push9=, $pop8, $pop5 +; CHECK-NEXT: i32.eqz $push10=, $pop9 +; CHECK-NEXT: return $pop10 + %cmp_7 = call i32 @memcmp(ptr %a, ptr %b, i32 7) + %res = icmp eq i32 %cmp_7, 0 + ret i1 %res +} + +; INFO: Negative test +; Should not expand even with simd128 +define i1 @memcmp_expand_129(ptr %a, ptr %b) { +; CHECK-LABEL: memcmp_expand_129: +; CHECK: .functype memcmp_expand_129 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const $push0=, 129 +; CHECK-NEXT: call $push1=, memcmp, $0, $1, $pop0 +; CHECK-NEXT: 
i32.eqz $push2=, $pop1 +; CHECK-NEXT: return $pop2 + %cmp_129 = call i32 @memcmp(ptr %a, ptr %b, i32 129) + %res = icmp eq i32 %cmp_129, 0 + ret i1 %res +} + +define i1 @memcmp_expand_2(ptr %a, ptr %b) { +; CHECK-LABEL: memcmp_expand_2: +; CHECK: .functype memcmp_expand_2 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.load16_u $push1=, 0($0):p2align=0 +; CHECK-NEXT: i32.load16_u $push0=, 0($1):p2align=0 +; CHECK-NEXT: i32.eq $push2=, $pop1, $pop0 +; CHECK-NEXT: return $pop2 + %cmp_2 = call i32 @memcmp(ptr %a, ptr %b, i32 2) + %res = icmp eq i32 %cmp_2, 0 + ret i1 %res +} + +define i1 @memcmp_expand_2_align(ptr align(2) %a, ptr align(2) %b) { +; CHECK-LABEL: memcmp_expand_2_align: +; CHECK: .functype memcmp_expand_2_align (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.load16_u $push1=, 0($0) +; CHECK-NEXT: i32.load16_u $push0=, 0($1) +; CHECK-NEXT: i32.eq $push2=, $pop1, $pop0 +; CHECK-NEXT: return $pop2 + %cmp_2 = call i32 @memcmp(ptr %a, ptr %b, i32 2) + %res = icmp eq i32 %cmp_2, 0 + ret i1 %res +} + +define i1 @memcmp_expand_8(ptr %a, ptr %b) { +; CHECK-LABEL: memcmp_expand_8: +; CHECK: .functype memcmp_expand_8 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i64.load $push1=, 0($0):p2align=0 +; CHECK-NEXT: i64.load $push0=, 0($1):p2align=0 +; CHECK-NEXT: i64.eq $push2=, $pop1, $pop0 +; CHECK-NEXT: return $pop2 + %cmp_8 = call i32 @memcmp(ptr %a, ptr %b, i32 8) + %res = icmp eq i32 %cmp_8, 0 + ret i1 %res +} + +; TODO: Should be using a single load i64x2 or equivalent in bitsizes +define i1 @memcmp_expand_16(ptr %a, ptr %b) { +; CHECK-LABEL: memcmp_expand_16: +; CHECK: .functype memcmp_expand_16 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i64.load $push7=, 0($0):p2align=0 +; CHECK-NEXT: i64.load $push6=, 0($1):p2align=0 +; CHECK-NEXT: i64.xor $push8=, $pop7, $pop6 +; CHECK-NEXT: i32.const $push0=, 8 +; CHECK-NEXT: i32.add $push3=, $0, $pop0 +; CHECK-NEXT: i64.load $push4=, 0($pop3):p2align=0 +; CHECK-NEXT: i32.const $push11=, 8 +; CHECK-NEXT: i32.add $push1=, $1, $pop11 +; CHECK-NEXT: i64.load $push2=, 0($pop1):p2align=0 +; CHECK-NEXT: i64.xor $push5=, $pop4, $pop2 +; CHECK-NEXT: i64.or $push9=, $pop8, $pop5 +; CHECK-NEXT: i64.eqz $push10=, $pop9 +; CHECK-NEXT: return $pop10 + %cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16) + %res = icmp eq i32 %cmp_16, 0 + ret i1 %res +} diff --git a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll index 8459ec8..b355a0d 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll @@ -441,3 +441,31 @@ define <2 x double> @promote_mixed_v2f64(<4 x float> %x, <4 x float> %y) { %a = fpext <2 x float> %v to <2 x double> ret <2 x double> %a } + +define <4 x float> @convert_u_v4f32_maybeneg(<4 x i32> %x) { +; CHECK-LABEL: convert_u_v4f32_maybeneg: +; CHECK: .functype convert_u_v4f32_maybeneg (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const 1 +; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: f32x4.convert_i32x4_u +; CHECK-NEXT: # fallthrough-return + %a = ashr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1> + %b = uitofp <4 x i32> %a to <4 x float> + ret <4 x float> %b +} + +define <4 x float> @convert_u_v4f32_nonneg(<4 x i32> %x) { +; CHECK-LABEL: convert_u_v4f32_nonneg: +; CHECK: .functype convert_u_v4f32_nonneg (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const 1 +; CHECK-NEXT: i32x4.shr_u +; CHECK-NEXT: 
f32x4.convert_i32x4_s +; CHECK-NEXT: # fallthrough-return + %a = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1> + %b = uitofp <4 x i32> %a to <4 x float> + ret <4 x float> %b +} diff --git a/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll b/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll index c93b8aa..eb39f90 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll @@ -12,7 +12,7 @@ define <4 x float> @extend_to_float_low_i16x8_u(<8 x i16> %x) { ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32x4.extend_low_i16x8_u -; CHECK-NEXT: f32x4.convert_i32x4_u +; CHECK-NEXT: f32x4.convert_i32x4_s ; CHECK-NEXT: # fallthrough-return %low = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %extended = uitofp <4 x i16> %low to <4 x float> @@ -25,7 +25,7 @@ define <4 x float> @extend_to_float_high_i16x8_u(<8 x i16> %x) { ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32x4.extend_high_i16x8_u -; CHECK-NEXT: f32x4.convert_i32x4_u +; CHECK-NEXT: f32x4.convert_i32x4_s ; CHECK-NEXT: # fallthrough-return %high = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %extended = uitofp <4 x i16> %high to <4 x float> @@ -39,7 +39,7 @@ define <4 x float> @extend_to_float_low_i8x16_u(<8 x i8> %x) { ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i16x8.extend_low_i8x16_u ; CHECK-NEXT: i32x4.extend_low_i16x8_u -; CHECK-NEXT: f32x4.convert_i32x4_u +; CHECK-NEXT: f32x4.convert_i32x4_s ; CHECK-NEXT: # fallthrough-return %low = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %extended = uitofp <4 x i8> %low to <4 x float> @@ -55,7 +55,7 @@ define <4 x float> @extend_to_float_high_i8x16_u(<8 x i8> %x) { ; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK-NEXT: i16x8.extend_low_i8x16_u ; CHECK-NEXT: i32x4.extend_low_i16x8_u -; CHECK-NEXT: f32x4.convert_i32x4_u +; CHECK-NEXT: f32x4.convert_i32x4_s ; CHECK-NEXT: # fallthrough-return %high = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %extended = uitofp <4 x i8> %high to <4 x float> @@ -136,7 +136,7 @@ define <2 x double> @extend_to_double_low_i16x4_u(<4 x i16> %x) { ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32x4.extend_low_i16x8_u -; CHECK-NEXT: f64x2.convert_low_i32x4_u +; CHECK-NEXT: f64x2.convert_low_i32x4_s ; CHECK-NEXT: # fallthrough-return %low = shufflevector <4 x i16> %x, <4 x i16> undef, <2 x i32> <i32 0, i32 1> %extended = uitofp <2 x i16> %low to <2 x double> diff --git a/llvm/test/CodeGen/WinEH/wineh-reuse-catch-alloca.ll b/llvm/test/CodeGen/WinEH/wineh-reuse-catch-alloca.ll index 2bd004e..9de79ee 100644 --- a/llvm/test/CodeGen/WinEH/wineh-reuse-catch-alloca.ll +++ b/llvm/test/CodeGen/WinEH/wineh-reuse-catch-alloca.ll @@ -1,4 +1,5 @@ -; RUN: llc %s --mtriple=x86_64-pc-windows-msvc -o - | FileCheck %s +; RUN: llc %s --mtriple=x86_64-pc-windows-msvc -o - | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: %if aarch64-registered-target %{ llc %s --mtriple=aarch64-pc-windows-msvc -o - | FileCheck %s --check-prefixes=CHECK,ARM64 %} ; Tests the fixed object layouts when two catchpads re-use the same stack ; allocation for this catch objects. @@ -18,27 +19,36 @@ ; } ; ``` -; Minimum stack alloc is 64 bytes, so no change there. ; CHECK-LABEL: calls_boom: -; CHECK: subq $64, %rsp -; CHECK: .seh_stackalloc 64 +; Minimum stack alloc is 64 bytes, so no change there. 
+; X64: subq $64, %rsp +; X64: .seh_stackalloc 64 +; Only need 48 bytes on the stack, not 64. +; ARM64: sub sp, sp, #48 +; ARM64: .seh_stackalloc 48 ; Both the catch blocks load from the same address. ; CHECK-LABEL: "?catch$3@?0?calls_boom@4HA": -; CHECK: movq -8(%rbp), %rax +; X64: movq -8(%rbp), %rax +; ARM64: ldr x8, [x29, #24] ; CHECK-LABEL: "?catch$4@?0?calls_boom@4HA": -; CHECK: movq -8(%rbp), %rax +; X64: movq -8(%rbp), %rax +; ARM64: ldr x8, [x29, #24] -; There's enough space for the UnwindHelp to be at 48 instead of 40 ; CHECK-LABEL: $cppxdata$calls_boom: -; CHECK: .long 48 # UnwindHelp +; There's enough space for the UnwindHelp to be at 48 instead of 40 +; X64: .long 48 # UnwindHelp +; There's enough space for the UnwindHelp to be at -16 instead of -32 +; ARM64: .word -16 // UnwindHelp ; Both catches have the same object offset. ; CHECK-LABEL: $handlerMap$0$calls_boom: -; CHECK: .long 56 # CatchObjOffset -; CHECK-NEXT: .long "?catch$3@?0?calls_boom@4HA"@IMGREL # Handler -; CHECK: .long 56 # CatchObjOffset -; CHECK-NEXT: .long "?catch$4@?0?calls_boom@4HA"@IMGREL # Handler +; X64: .long 56 # CatchObjOffset +; ARM64: .word -8 // CatchObjOffset +; CHECK-NEXT: "?catch$3@?0?calls_boom@4HA"@IMGREL +; X64: .long 56 # CatchObjOffset +; ARM64: .word -8 // CatchObjOffset +; CHECK-NEXT: "?catch$4@?0?calls_boom@4HA"@IMGREL %rtti.TypeDescriptor2 = type { ptr, ptr, [3 x i8] } diff --git a/llvm/test/CodeGen/X86/peephole-copy.mir b/llvm/test/CodeGen/X86/peephole-copy.mir index e24abf84..f399398 100644 --- a/llvm/test/CodeGen/X86/peephole-copy.mir +++ b/llvm/test/CodeGen/X86/peephole-copy.mir @@ -22,14 +22,14 @@ body: | bb.0: ; CHECK-LABEL: name: c ; CHECK: [[MOV32ri:%[0-9]+]]:gr32_abcd = MOV32ri 512 - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2359305 /* reguse:GR32 */, [[MOV32ri]], 1 /* reguse */, implicit-def early-clobber $df + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 3211273 /* reguse:GR32_ABCD */, [[MOV32ri]], 1 /* reguse */, implicit-def early-clobber $df ; CHECK-NEXT: [[MOV32ri1:%[0-9]+]]:gr32_abcd = MOV32ri 512 - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2359305 /* reguse:GR32 */, [[MOV32ri1]], 1 /* reguse */, implicit-def early-clobber $df + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 3211273 /* reguse:GR32_ABCD */, [[MOV32ri1]], 1 /* reguse */, implicit-def early-clobber $df ; CHECK-NEXT: RET 0 %2 = MOV32ri 512 %0 = COPY %2 - INLINEASM &"", 1 /* sideeffect attdialect */, 2359305 /* reguse:GR32_ABCD */, %0:gr32_abcd, 1 /* clobber */, implicit-def early-clobber $df + INLINEASM &"", 1 /* sideeffect attdialect */, 3211273 /* reguse:GR32_ABCD */, %0:gr32_abcd, 1 /* clobber */, implicit-def early-clobber $df %1 = COPY %2 - INLINEASM &"", 1 /* sideeffect attdialect */, 2359305 /* reguse:GR32_ABCD */, %1:gr32_abcd, 1 /* clobber */, implicit-def early-clobber $df + INLINEASM &"", 1 /* sideeffect attdialect */, 3211273 /* reguse:GR32_ABCD */, %1:gr32_abcd, 1 /* clobber */, implicit-def early-clobber $df RET 0 ... 
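An aside on the peephole-copy.mir hunk above: the INLINEASM immediates change because the flag word encodes the operand's register class. Assuming the usual layout (operand kind in the low three bits, operand count next, register-class ID plus one in the upper bits; the authoritative definition lives in llvm/include/llvm/IR/InlineAsm.h), retagging the constraint from GR32 to GR32_ABCD is exactly the difference between 2359305 (0x240009) and 3211273 (0x310009). A minimal decode sketch in C, under that assumed layout:

#include <stdio.h>

/* Decode an LLVM INLINEASM operand-flag word. Assumed layout (verify
   against llvm/include/llvm/IR/InlineAsm.h): kind in bits 0-2, operand
   count in bits 3-15, register-class ID plus one in bits 16 and up,
   where 0 means "no register class attached". */
static void decode(unsigned flag) {
  unsigned kind = flag & 7;             /* 1 == reguse */
  unsigned nops = (flag >> 3) & 0x1fff; /* number of following operands */
  unsigned rc   = flag >> 16;           /* RC ID + 1, or 0 if absent */
  if (rc)
    printf("flag=0x%x kind=%u numops=%u regclass-id=%u\n", flag, kind, nops, rc - 1);
  else
    printf("flag=0x%x kind=%u numops=%u no-regclass\n", flag, kind, nops);
}

int main(void) {
  decode(2359305); /* 0x240009: the old "reguse:GR32" immediate */
  decode(3211273); /* 0x310009: the new "reguse:GR32_ABCD" immediate */
  return 0;
}

Both words decode to kind 1 (reguse) with one operand; only the register-class field differs.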
diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll index 2d1b7fc..9728e13 100644 --- a/llvm/test/CodeGen/X86/pr62286.ll +++ b/llvm/test/CodeGen/X86/pr62286.ll @@ -42,10 +42,10 @@ define i64 @PR62286(i32 %a) { ; AVX2-LABEL: PR62286: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 -; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm1 -; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX2-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/pseudo-probe-desc-check.ll b/llvm/test/CodeGen/X86/pseudo-probe-desc-check.ll new file mode 100644 index 0000000..841061c --- /dev/null +++ b/llvm/test/CodeGen/X86/pseudo-probe-desc-check.ll @@ -0,0 +1,47 @@ +; REQUIRES: asserts +; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s -o /dev/null 2>&1 | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-windows-msvc < %s -o /dev/null 2>&1 | FileCheck %s + +; CHECK: warning: Guid:8314849053352128226 Name:inlinee does not exist in pseudo probe desc +; CHECK: warning: Guid:6492337042787843907 Name:extract2 does not exist in pseudo probe desc + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" + +define void @extract1() !dbg !8 { +entry: + call void @llvm.pseudoprobe(i64 6028998432455395745, i64 1, i32 0, i64 -1), !dbg !11 + call void @llvm.pseudoprobe(i64 8314849053352128226, i64 1, i32 0, i64 -1), !dbg !12 + ret void, !dbg !16 +} + +define void @extract2() !dbg !17 { +entry: + call void @llvm.pseudoprobe(i64 6492337042787843907, i64 1, i32 0, i64 -1), !dbg !18 + ret void, !dbg !18 +} + +declare void @llvm.pseudoprobe(i64, i64, i32, i64) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6} +!llvm.pseudo_probe_desc = !{!7} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: false, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/home/foo") +!2 = !{i32 7, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{i64 6028998432455395745, i64 281479271677951, !"extract1"} +!8 = distinct !DISubprogram(name: "extract1", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, spFlags: DISPFlagDefinition, unit: !0) +!9 = !DISubroutineType(types: !10) +!10 = !{} +!11 = !DILocation(line: 5, column: 3, scope: !8) +!12 = !DILocation(line: 2, column: 1, scope: !13, inlinedAt: !14) +!13 = distinct !DISubprogram(name: "inlinee", scope: !1, file: !1, line: 1, type: !9, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0) +!14 = distinct !DILocation(line: 5, column: 3, scope: !15) +!15 = !DILexicalBlockFile(scope: !8, file: !1, discriminator: 455082007) +!16 = !DILocation(line: 6, column: 1, scope: !8) +!17 = distinct !DISubprogram(name: "extract2", scope: !1, file: !1, line: 8, type: !9, scopeLine: 8, spFlags: DISPFlagDefinition, unit: !0) +!18 = !DILocation(line: 9, column: 
1, scope: !17) diff --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll index d273d09..c7cf9cb 100644 --- a/llvm/test/CodeGen/X86/select-optimize.ll +++ b/llvm/test/CodeGen/X86/select-optimize.ll @@ -229,9 +229,10 @@ define i32 @expensive_val_operand4(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) { } ; Expensive cold value operand with unsafe-to-sink (due to lifetime-end marker) load (partial slice sinking). -define i32 @expensive_val_operand5(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) { +define i32 @expensive_val_operand5(i32 %b, i32 %y, i1 %cmp) { ; CHECK-LABEL: @expensive_val_operand5( -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[A]], align 8 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[A]]) ; CHECK-NEXT: [[CMP_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] ; CHECK-NEXT: br i1 [[CMP_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END:%.*]], !prof [[PROF18]] @@ -242,6 +243,7 @@ define i32 @expensive_val_operand5(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) { ; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[X]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[TMP0:%.*]] ] ; CHECK-NEXT: ret i32 [[SEL]] ; + %a = alloca i32 %load = load i32, ptr %a, align 8 call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %a) %x = add i32 %load, %b diff --git a/llvm/test/CodeGen/X86/swap.ll b/llvm/test/CodeGen/X86/swap.ll index e556900..1dc454dd 100644 --- a/llvm/test/CodeGen/X86/swap.ll +++ b/llvm/test/CodeGen/X86/swap.ll @@ -47,12 +47,10 @@ define dso_local void @onealloc_noreadback(ptr nocapture %a, ptr nocapture %b) l entry: %alloc = alloca [16 x i8], i8 2, align 1 %part2 = getelementptr inbounds [16 x i8], ptr %alloc, i64 1, i64 0 - call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %alloc) - call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %part2) + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %alloc) call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %alloc, ptr align 1 %a, i64 16, i1 false) tail call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %part2, ptr align 1 %b, i64 16, i1 false) - call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %alloc) - call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %part2) + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %alloc) ret void } @@ -115,8 +113,9 @@ define dso_local void @onealloc_readback_1(ptr nocapture %a, ptr nocapture %b) l ; ; AA-LABEL: onealloc_readback_1: ; AA: # %bb.0: # %entry -; AA-NEXT: vmovups (%rsi), %xmm0 +; AA-NEXT: vmovups (%rdi), %xmm0 ; AA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AA-NEXT: vmovups (%rsi), %xmm0 ; AA-NEXT: vmovups %xmm0, (%rdi) ; AA-NEXT: retq entry: diff --git a/llvm/test/ExecutionEngine/RuntimeDyld/LoongArch/ELF_LoongArch_relocations.s b/llvm/test/ExecutionEngine/RuntimeDyld/LoongArch/ELF_LoongArch_relocations.s index 0fca88b..ddbf02c 100644 --- a/llvm/test/ExecutionEngine/RuntimeDyld/LoongArch/ELF_LoongArch_relocations.s +++ b/llvm/test/ExecutionEngine/RuntimeDyld/LoongArch/ELF_LoongArch_relocations.s @@ -2,6 +2,9 @@ # RUN: llvm-mc --triple=loongarch64 --filetype=obj -o %t/reloc.o %s # RUN: llvm-rtdyld --triple=loongarch64 --verify --check=%s %t/reloc.o \ # RUN: --map-section reloc.o,.got=0x21f00 \ +# RUN: --map-section reloc.o,.sec.large.pc=0x0000000012345000 \ +# RUN: --map-section reloc.o,.sec.large.got=0x44433333abcde000 \ +# RUN: --map-section reloc.o,.sec.dummy=0x4443333334567111 \ # RUN: --dummy-extern abs=0x0123456789abcdef \ # RUN: 
--dummy-extern external_data=0x1234 @@ -100,3 +103,42 @@ named_data: .quad 0x2222222222222222 .quad 0x3333333333333333 .size named_data, .-named_data + + .section .sec.large.pc,"ax" + .globl test_large_pc +test_large_pc: +## Code after link should be: +## 1a44444d pcalau12i $t1, 139810 +## 02c4440c addi.d $t0, $zero, 273 +## 1666666c lu32i.d $t0, 209715 +## 0311118c lu52i.d $t0, $t0, 1092 + +# rtdyld-check: *{4}(test_large_pc) = 0x1a44444d + pcalau12i $t1, %pc_hi20(.sec.dummy) +# rtdyld-check: *{4}(test_large_pc + 4) = 0x02c4440c + addi.d $t0, $zero, %pc_lo12(.sec.dummy) +# rtdyld-check: *{4}(test_large_pc + 8) = 0x1666666c + lu32i.d $t0, %pc64_lo20(.sec.dummy) +# rtdyld-check: *{4}(test_large_pc + 12) = 0x0311118c + lu52i.d $t0, $t0, %pc64_hi12(.sec.dummy) + + .section .sec.large.got,"ax" + .globl test_large_got +test_large_got: +## Code after link should be: +## 1aa8688d pcalau12i $t1, 344900 +## 02fc000c addi.d $t0, $zero, -256 +## 1799996c lu32i.d $t0, -209717 +## 032eed8c lu52i.d $t0, $t0, -1093 + +# rtdyld-check: *{4}(test_large_got) = 0x1aa8688d + pcalau12i $t1, %got_pc_hi20(external_data) +# rtdyld-check: *{4}(test_large_got + 4) = 0x02fc000c + addi.d $t0, $zero, %got_pc_lo12(external_data) +# rtdyld-check: *{4}(test_large_got + 8) = 0x1799996c + lu32i.d $t0, %got64_pc_lo20(external_data) +# rtdyld-check: *{4}(test_large_got + 12) = 0x032eed8c + lu52i.d $t0, $t0, %got64_pc_hi12(external_data) + + .section .sec.dummy,"a" + .word 0 diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll index e9c1075..ae8b2b3 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll @@ -23,7 +23,7 @@ declare i32 @dummyPersonality(...) 
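As an aside before the ASan funclet test: the LoongArch large-code-model sequence above (test_large_pc) composes a 64-bit PC-relative address in four instructions, and the expected immediates follow directly from the --map-section addresses in the RUN lines. A C sketch of the split, ignoring the sign-extension carry adjustments the real relocation formulas apply (none trigger for these particular addresses):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Section placements copied from the --map-section RUN lines above. */
  uint64_t dest = 0x4443333334567111ULL; /* .sec.dummy */
  uint64_t pc   = 0x0000000012345000ULL; /* .sec.large.pc (test_large_pc) */

  unsigned pc_hi20   = (unsigned)(((dest >> 12) - (pc >> 12)) & 0xfffff); /* pcalau12i */
  unsigned pc_lo12   = (unsigned)(dest & 0xfff);                          /* addi.d    */
  unsigned pc64_lo20 = (unsigned)((dest >> 32) & 0xfffff);                /* lu32i.d   */
  unsigned pc64_hi12 = (unsigned)(dest >> 52);                            /* lu52i.d   */

  /* Prints 139810 273 209715 1092, matching the immediates in the
     "Code after link should be" comments of the test. */
  printf("%u %u %u %u\n", pc_hi20, pc_lo12, pc64_lo20, pc64_hi12);
  return 0;
}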
define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr @__CxxFrameHandler3 { ; CHECK-INLINE-LABEL: define void @FuncletPersonality( -; CHECK-INLINE-SAME: ptr [[PTRPARAM:%.*]]) #[[ATTR4:[0-9]+]] personality ptr @__CxxFrameHandler3 { +; CHECK-INLINE-SAME: ptr [[PTRPARAM:%.*]]) #[[ATTR3:[0-9]+]] personality ptr @__CxxFrameHandler3 { ; CHECK-INLINE-NEXT: entry: ; CHECK-INLINE-NEXT: [[TMP0:%.*]] = alloca i64, align 32 ; CHECK-INLINE-NEXT: store i64 0, ptr [[TMP0]], align 8 @@ -87,7 +87,6 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr ; CHECK-INLINE-NEXT: call void @__asan_set_shadow_f3(i64 [[TMP38]], i64 1) ; CHECK-INLINE-NEXT: [[TMP39:%.*]] = add i64 [[TMP29]], 1066 ; CHECK-INLINE-NEXT: call void @__asan_set_shadow_04(i64 [[TMP39]], i64 1) -; CHECK-INLINE-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[TMP22]]) ; CHECK-INLINE-NEXT: [[TMP40:%.*]] = lshr i64 [[TMP21]], 3 ; CHECK-INLINE-NEXT: [[TMP41:%.*]] = add i64 [[TMP40]], [[TMP1]] ; CHECK-INLINE-NEXT: [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr @@ -100,13 +99,12 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr ; CHECK-INLINE-NEXT: [[TMP48:%.*]] = icmp sge i8 [[TMP47]], [[TMP43]] ; CHECK-INLINE-NEXT: br i1 [[TMP48]], label [[TMP49:%.*]], label [[TMP50]] ; CHECK-INLINE: 49: -; CHECK-INLINE-NEXT: call void @__asan_report_store1(i64 [[TMP21]]) #[[ATTR8:[0-9]+]] +; CHECK-INLINE-NEXT: call void @__asan_report_store1(i64 [[TMP21]]) #[[ATTR7:[0-9]+]] ; CHECK-INLINE-NEXT: unreachable ; CHECK-INLINE: 50: ; CHECK-INLINE-NEXT: store volatile i8 0, ptr [[TMP22]], align 1 ; CHECK-INLINE-NEXT: [[TMP51:%.*]] = add i64 [[TMP29]], 1066 ; CHECK-INLINE-NEXT: call void @__asan_set_shadow_f8(i64 [[TMP51]], i64 1) -; CHECK-INLINE-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP22]]) ; CHECK-INLINE-NEXT: [[TMP52:%.*]] = alloca i8, i64 96, align 32 ; CHECK-INLINE-NEXT: [[TMP53:%.*]] = ptrtoint ptr [[TMP52]] to i64 ; CHECK-INLINE-NEXT: [[TMP54:%.*]] = add i64 [[TMP53]], 32 @@ -128,7 +126,7 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr ; CHECK-INLINE-NEXT: [[TMP66:%.*]] = icmp ne i8 [[TMP65]], 0 ; CHECK-INLINE-NEXT: br i1 [[TMP66]], label [[TMP67:%.*]], label [[TMP68:%.*]] ; CHECK-INLINE: 67: -; CHECK-INLINE-NEXT: call void @__asan_report_store8(i64 [[TMP59]]) #[[ATTR8]] +; CHECK-INLINE-NEXT: call void @__asan_report_store8(i64 [[TMP59]]) #[[ATTR7]] ; CHECK-INLINE-NEXT: unreachable ; CHECK-INLINE: 68: ; CHECK-INLINE-NEXT: store volatile i64 0, ptr [[TMP61]], align 8 @@ -158,7 +156,7 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr ; CHECK-INLINE-NEXT: [[TMP88:%.*]] = icmp sge i8 [[TMP87]], [[TMP83]] ; CHECK-INLINE-NEXT: br i1 [[TMP88]], label [[TMP89:%.*]], label [[TMP90]] ; CHECK-INLINE: 89: -; CHECK-INLINE-NEXT: call void @__asan_report_store1(i64 [[TMP77]]) #[[ATTR8]] +; CHECK-INLINE-NEXT: call void @__asan_report_store1(i64 [[TMP77]]) #[[ATTR7]] ; CHECK-INLINE-NEXT: unreachable ; CHECK-INLINE: 90: ; CHECK-INLINE-NEXT: store volatile i8 0, ptr [[TMP79]], align 1 @@ -185,7 +183,6 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr ; CHECK-INLINE: ehcleanup: ; CHECK-INLINE-NEXT: [[TMP98:%.*]] = cleanuppad within none [] ; CHECK-INLINE-NEXT: call void @__asan_unpoison_stack_memory(i64 [[TMP54]], i64 4) [ "funclet"(token [[TMP98]]) ] -; CHECK-INLINE-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[TMP56]]) ; CHECK-INLINE-NEXT: [[TMP99:%.*]] = lshr i64 
[[TMP54]], 3 ; CHECK-INLINE-NEXT: [[TMP100:%.*]] = add i64 [[TMP99]], [[TMP1]] ; CHECK-INLINE-NEXT: [[TMP101:%.*]] = inttoptr i64 [[TMP100]] to ptr @@ -198,12 +195,11 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr ; CHECK-INLINE-NEXT: [[TMP107:%.*]] = icmp sge i8 [[TMP106]], [[TMP102]] ; CHECK-INLINE-NEXT: br i1 [[TMP107]], label [[TMP108:%.*]], label [[TMP109]] ; CHECK-INLINE: 108: -; CHECK-INLINE-NEXT: call void @__asan_report_store1(i64 [[TMP54]]) #[[ATTR8]] [ "funclet"(token [[TMP98]]) ] +; CHECK-INLINE-NEXT: call void @__asan_report_store1(i64 [[TMP54]]) #[[ATTR7]] [ "funclet"(token [[TMP98]]) ] ; CHECK-INLINE-NEXT: unreachable ; CHECK-INLINE: 109: ; CHECK-INLINE-NEXT: store volatile i8 0, ptr [[TMP56]], align 1 ; CHECK-INLINE-NEXT: call void @__asan_poison_stack_memory(i64 [[TMP54]], i64 4) [ "funclet"(token [[TMP98]]) ] -; CHECK-INLINE-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP56]]) ; CHECK-INLINE-NEXT: call void @DeInit(ptr [[TMP14]]) [ "funclet"(token [[TMP98]]) ] ; CHECK-INLINE-NEXT: [[TMP110:%.*]] = call ptr @__asan_memset(ptr [[TMP16]], i32 0, i64 4) [ "funclet"(token [[TMP98]]) ] ; CHECK-INLINE-NEXT: [[TMP111:%.*]] = call ptr @__asan_memcpy(ptr [[TMP18]], ptr [[TMP16]], i64 4) [ "funclet"(token [[TMP98]]) ] @@ -226,7 +222,7 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr ; CHECK-INLINE-NEXT: [[TMP125:%.*]] = icmp sge i8 [[TMP124]], [[TMP120]] ; CHECK-INLINE-NEXT: br i1 [[TMP125]], label [[TMP126:%.*]], label [[TMP127]] ; CHECK-INLINE: 126: -; CHECK-INLINE-NEXT: call void @__asan_report_store_n(i64 [[TMP116]], i64 8) #[[ATTR8]] [ "funclet"(token [[TMP98]]) ] +; CHECK-INLINE-NEXT: call void @__asan_report_store_n(i64 [[TMP116]], i64 8) #[[ATTR7]] [ "funclet"(token [[TMP98]]) ] ; CHECK-INLINE-NEXT: unreachable ; CHECK-INLINE: 127: ; CHECK-INLINE-NEXT: [[TMP128:%.*]] = lshr i64 [[TMP114]], 3 @@ -241,7 +237,7 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr ; CHECK-INLINE-NEXT: [[TMP136:%.*]] = icmp sge i8 [[TMP135]], [[TMP131]] ; CHECK-INLINE-NEXT: br i1 [[TMP136]], label [[TMP137:%.*]], label [[EHEXIT]] ; CHECK-INLINE: 137: -; CHECK-INLINE-NEXT: call void @__asan_report_store_n(i64 [[TMP114]], i64 8) #[[ATTR8]] [ "funclet"(token [[TMP98]]) ] +; CHECK-INLINE-NEXT: call void @__asan_report_store_n(i64 [[TMP114]], i64 8) #[[ATTR7]] [ "funclet"(token [[TMP98]]) ] ; CHECK-INLINE-NEXT: unreachable ; CHECK-INLINE: ehexit: ; CHECK-INLINE-NEXT: store i64 0, ptr [[PTRPARAM]], align 1 @@ -265,7 +261,7 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr ; CHECK-INLINE-NEXT: cleanupret from [[TMP98]] unwind to caller ; ; CHECK-OUTLINE-LABEL: define void @FuncletPersonality( -; CHECK-OUTLINE-SAME: ptr [[PTRPARAM:%.*]]) #[[ATTR4:[0-9]+]] personality ptr @__CxxFrameHandler3 { +; CHECK-OUTLINE-SAME: ptr [[PTRPARAM:%.*]]) #[[ATTR3:[0-9]+]] personality ptr @__CxxFrameHandler3 { ; CHECK-OUTLINE-NEXT: entry: ; CHECK-OUTLINE-NEXT: [[TMP0:%.*]] = alloca i64, align 32 ; CHECK-OUTLINE-NEXT: store i64 0, ptr [[TMP0]], align 8 @@ -339,12 +335,10 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr ; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_f3(i64 [[TMP45]], i64 5) ; CHECK-OUTLINE-NEXT: [[TMP46:%.*]] = add i64 [[TMP33]], 1066 ; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_04(i64 [[TMP46]], i64 1) -; CHECK-OUTLINE-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[TMP22]]) ; CHECK-OUTLINE-NEXT: call void 
@__asan_store1(i64 [[TMP21]]) ; CHECK-OUTLINE-NEXT: store volatile i8 0, ptr [[TMP22]], align 1 ; CHECK-OUTLINE-NEXT: [[TMP47:%.*]] = add i64 [[TMP33]], 1066 ; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_f8(i64 [[TMP47]], i64 1) -; CHECK-OUTLINE-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP22]]) ; CHECK-OUTLINE-NEXT: call void @__asan_store8(i64 [[TMP25]]) ; CHECK-OUTLINE-NEXT: store volatile i64 0, ptr [[TMP26]], align 8 ; CHECK-OUTLINE-NEXT: [[TMPCOPYI64:%.*]] = load i64, ptr [[TMP26]], align 8 @@ -389,12 +383,10 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr ; CHECK-OUTLINE-NEXT: [[TMP67:%.*]] = cleanuppad within none [] ; CHECK-OUTLINE-NEXT: [[TMP68:%.*]] = add i64 [[TMP33]], 1068 ; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_04(i64 [[TMP68]], i64 1) [ "funclet"(token [[TMP67]]) ] -; CHECK-OUTLINE-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[TMP24]]) ; CHECK-OUTLINE-NEXT: call void @__asan_store1(i64 [[TMP23]]) [ "funclet"(token [[TMP67]]) ] ; CHECK-OUTLINE-NEXT: store volatile i8 0, ptr [[TMP24]], align 1 ; CHECK-OUTLINE-NEXT: [[TMP69:%.*]] = add i64 [[TMP33]], 1068 ; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_f8(i64 [[TMP69]], i64 1) [ "funclet"(token [[TMP67]]) ] -; CHECK-OUTLINE-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP24]]) ; CHECK-OUTLINE-NEXT: call void @DeInit(ptr [[TMP14]]) [ "funclet"(token [[TMP67]]) ] ; CHECK-OUTLINE-NEXT: [[TMP70:%.*]] = call ptr @__asan_memset(ptr [[TMP16]], i32 0, i64 4) [ "funclet"(token [[TMP67]]) ] ; CHECK-OUTLINE-NEXT: [[TMP71:%.*]] = call ptr @__asan_memcpy(ptr [[TMP18]], ptr [[TMP16]], i64 4) [ "funclet"(token [[TMP67]]) ] @@ -495,7 +487,7 @@ nopredecessor: ; Non-Windows personality, ensure no funclet gets attached to asan runtime call. 
define void @OtherPersonality(ptr %ptrParam) sanitize_address personality ptr @dummyPersonality { ; CHECK-LABEL: define void @OtherPersonality( -; CHECK-SAME: ptr [[PTRPARAM:%.*]]) #[[ATTR4:[0-9]+]] personality ptr @dummyPersonality { +; CHECK-SAME: ptr [[PTRPARAM:%.*]]) #[[ATTR3:[0-9]+]] personality ptr @dummyPersonality { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 ; CHECK-NEXT: [[ASAN_LOCAL_STACK_BASE:%.*]] = alloca i64, align 8 diff --git a/llvm/test/Instrumentation/AddressSanitizer/lifetime-throw.ll b/llvm/test/Instrumentation/AddressSanitizer/lifetime-throw.ll index eac414a9..ddfa5e1 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/lifetime-throw.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/lifetime-throw.ll @@ -24,7 +24,7 @@ entry: call void @llvm.lifetime.start.p0(i64 4, ptr %x) ; CHECK: store i8 4, ptr %{{[0-9]+}} - ; CHECK-NEXT: @llvm.lifetime.start + ; CHECK-NOT: @llvm.lifetime.start %exception = call ptr @__cxa_allocate_exception(i64 4) invoke void @__cxa_throw(ptr %exception, ptr @_ZTI3ABC, ptr @_ZN3ABCD2Ev) noreturn @@ -38,7 +38,7 @@ lpad: call void @_ZN3ABCD2Ev(ptr nonnull %x) call void @llvm.lifetime.end.p0(i64 4, ptr %x) ; CHECK: store i8 -8, ptr %{{[0-9]+}} - ; CHECK-NEXT: @llvm.lifetime.end + ; CHECK-NOT: @llvm.lifetime.end resume { ptr, i32 } %0 ; CHECK: store i64 0, ptr %{{[0-9]+}} @@ -77,7 +77,7 @@ entry: call void @llvm.lifetime.start.p0(i64 4, ptr %x) ; CHECK: store i8 4, ptr %{{[0-9]+}} - ; CHECK-NEXT: @llvm.lifetime.start + ; CHECK-NOT: @llvm.lifetime.start invoke void @_CxxThrowException(ptr %tmp, ptr nonnull @"_TI1?AUABC@@") noreturn to label %unreachable unwind label %ehcleanup @@ -89,7 +89,7 @@ ehcleanup: call void @"\01??1ABC@@QEAA@XZ"(ptr nonnull %x) [ "funclet"(token %0) ] call void @llvm.lifetime.end.p0(i64 4, ptr %x) ; CHECK: store i8 -8, ptr %{{[0-9]+}} - ; CHECK-NEXT: @llvm.lifetime.end + ; CHECK-NOT: @llvm.lifetime.end cleanupret from %0 unwind to caller ; CHECK: store i64 0, ptr %{{[0-9]+}} diff --git a/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll b/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll index a878dbe..bbfe00b 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll @@ -30,7 +30,6 @@ define void @lifetime_no_size(i64 %i) sanitize_address { ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 0 ; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr ; CHECK-NEXT: store i64 -868083117767659023, ptr [[TMP11]], align 1 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr [[TMP2]]) ; CHECK-NEXT: [[AI:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP2]], i64 0, i64 [[I]] ; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[AI]] to i64 ; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP12]], 3 @@ -49,7 +48,6 @@ define void @lifetime_no_size(i64 %i) sanitize_address { ; CHECK-NEXT: unreachable ; CHECK: [[BB23]]: ; CHECK-NEXT: store volatile i8 0, ptr [[AI]], align 4 -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[TMP2]]) ; CHECK-NEXT: store i64 1172321806, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP9]], 0 ; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr @@ -100,7 +98,6 @@ define void @lifetime() sanitize_address { ; CHECK-DEFAULT-NEXT: [[TMP14:%.*]] = add i64 [[TMP11]], 4 ; CHECK-DEFAULT-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr ; CHECK-DEFAULT-NEXT: store i8 4, ptr [[TMP15]], align 1 -; CHECK-DEFAULT-NEXT: call void 
@llvm.lifetime.start.p0(i64 3, ptr [[TMP4]]) ; CHECK-DEFAULT-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[TMP4]] to i64 ; CHECK-DEFAULT-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP16]], 3 ; CHECK-DEFAULT-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 2147450880 @@ -121,11 +118,9 @@ define void @lifetime() sanitize_address { ; CHECK-DEFAULT-NEXT: [[TMP28:%.*]] = add i64 [[TMP11]], 4 ; CHECK-DEFAULT-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr ; CHECK-DEFAULT-NEXT: store i8 -8, ptr [[TMP29]], align 1 -; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP4]]) ; CHECK-DEFAULT-NEXT: [[TMP30:%.*]] = add i64 [[TMP11]], 4 ; CHECK-DEFAULT-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr ; CHECK-DEFAULT-NEXT: store i8 -8, ptr [[TMP31]], align 1 -; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr [[TMP4]]) ; CHECK-DEFAULT-NEXT: [[TMP32:%.*]] = alloca i8, i64 128, align 32 ; CHECK-DEFAULT-NEXT: [[TMP33:%.*]] = ptrtoint ptr [[TMP32]] to i64 ; CHECK-DEFAULT-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 32 @@ -135,7 +130,6 @@ define void @lifetime() sanitize_address { ; CHECK-DEFAULT-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP34]] to ptr ; CHECK-DEFAULT-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[TMP36]] to i64 ; CHECK-DEFAULT-NEXT: call void @__asan_unpoison_stack_memory(i64 [[TMP37]], i64 40) -; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.start.p0(i64 40, ptr [[TMP36]]) ; CHECK-DEFAULT-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP36]] to i64 ; CHECK-DEFAULT-NEXT: [[TMP39:%.*]] = lshr i64 [[TMP38]], 3 ; CHECK-DEFAULT-NEXT: [[TMP40:%.*]] = add i64 [[TMP39]], 2147450880 @@ -155,11 +149,9 @@ define void @lifetime() sanitize_address { ; CHECK-DEFAULT-NEXT: store volatile i8 0, ptr [[TMP36]], align 1 ; CHECK-DEFAULT-NEXT: [[TMP50:%.*]] = ptrtoint ptr [[TMP36]] to i64 ; CHECK-DEFAULT-NEXT: call void @__asan_poison_stack_memory(i64 [[TMP50]], i64 40) -; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.end.p0(i64 40, ptr [[TMP36]]) ; CHECK-DEFAULT-NEXT: [[TMP51:%.*]] = add i64 [[TMP11]], 4 ; CHECK-DEFAULT-NEXT: [[TMP52:%.*]] = inttoptr i64 [[TMP51]] to ptr ; CHECK-DEFAULT-NEXT: store i8 4, ptr [[TMP52]], align 1 -; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr [[TMP4]]) ; CHECK-DEFAULT-NEXT: [[TMP53:%.*]] = ptrtoint ptr [[TMP4]] to i64 ; CHECK-DEFAULT-NEXT: [[TMP54:%.*]] = lshr i64 [[TMP53]], 3 ; CHECK-DEFAULT-NEXT: [[TMP55:%.*]] = add i64 [[TMP54]], 2147450880 @@ -180,7 +172,6 @@ define void @lifetime() sanitize_address { ; CHECK-DEFAULT-NEXT: [[TMP65:%.*]] = add i64 [[TMP11]], 4 ; CHECK-DEFAULT-NEXT: [[TMP66:%.*]] = inttoptr i64 [[TMP65]] to ptr ; CHECK-DEFAULT-NEXT: store i8 -8, ptr [[TMP66]], align 1 -; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP4]]) ; CHECK-DEFAULT-NEXT: [[TMP67:%.*]] = ptrtoint ptr [[TMP1]] to i64 ; CHECK-DEFAULT-NEXT: [[TMP68:%.*]] = load i64, ptr [[TMP1]], align 8 ; CHECK-DEFAULT-NEXT: call void @__asan_allocas_unpoison(i64 [[TMP68]], i64 [[TMP67]]) @@ -212,7 +203,6 @@ define void @lifetime() sanitize_address { ; CHECK-NO-DYNAMIC-NEXT: [[TMP13:%.*]] = add i64 [[TMP10]], 4 ; CHECK-NO-DYNAMIC-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr ; CHECK-NO-DYNAMIC-NEXT: store i8 4, ptr [[TMP14]], align 1 -; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.start.p0(i64 3, ptr [[TMP3]]) ; CHECK-NO-DYNAMIC-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP3]] to i64 ; CHECK-NO-DYNAMIC-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP15]], 3 ; CHECK-NO-DYNAMIC-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 2147450880 @@ -233,11 +223,9 @@ define void @lifetime() sanitize_address { ; 
CHECK-NO-DYNAMIC-NEXT: [[TMP27:%.*]] = add i64 [[TMP10]], 4 ; CHECK-NO-DYNAMIC-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr ; CHECK-NO-DYNAMIC-NEXT: store i8 -8, ptr [[TMP28]], align 1 -; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP3]]) ; CHECK-NO-DYNAMIC-NEXT: [[TMP29:%.*]] = add i64 [[TMP10]], 4 ; CHECK-NO-DYNAMIC-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr ; CHECK-NO-DYNAMIC-NEXT: store i8 -8, ptr [[TMP30]], align 1 -; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr [[TMP3]]) ; CHECK-NO-DYNAMIC-NEXT: [[ARR:%.*]] = alloca [10 x i32], align 16 ; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.start.p0(i64 40, ptr [[ARR]]) ; CHECK-NO-DYNAMIC-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[ARR]] to i64 @@ -261,7 +249,6 @@ define void @lifetime() sanitize_address { ; CHECK-NO-DYNAMIC-NEXT: [[TMP43:%.*]] = add i64 [[TMP10]], 4 ; CHECK-NO-DYNAMIC-NEXT: [[TMP44:%.*]] = inttoptr i64 [[TMP43]] to ptr ; CHECK-NO-DYNAMIC-NEXT: store i8 4, ptr [[TMP44]], align 1 -; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr [[TMP3]]) ; CHECK-NO-DYNAMIC-NEXT: [[TMP45:%.*]] = ptrtoint ptr [[TMP3]] to i64 ; CHECK-NO-DYNAMIC-NEXT: [[TMP46:%.*]] = lshr i64 [[TMP45]], 3 ; CHECK-NO-DYNAMIC-NEXT: [[TMP47:%.*]] = add i64 [[TMP46]], 2147450880 @@ -282,7 +269,6 @@ define void @lifetime() sanitize_address { ; CHECK-NO-DYNAMIC-NEXT: [[TMP57:%.*]] = add i64 [[TMP10]], 4 ; CHECK-NO-DYNAMIC-NEXT: [[TMP58:%.*]] = inttoptr i64 [[TMP57]] to ptr ; CHECK-NO-DYNAMIC-NEXT: store i8 -8, ptr [[TMP58]], align 1 -; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP3]]) ; CHECK-NO-DYNAMIC-NEXT: store i64 1172321806, ptr [[TMP4]], align 8 ; CHECK-NO-DYNAMIC-NEXT: [[TMP59:%.*]] = add i64 [[TMP10]], 0 ; CHECK-NO-DYNAMIC-NEXT: [[TMP60:%.*]] = inttoptr i64 [[TMP59]] to ptr @@ -325,166 +311,6 @@ define void @lifetime() sanitize_address { ret void } -; Check that arguments of lifetime may come from phi nodes. 
-define void @phi_args(i1 %x) sanitize_address { -; CHECK-LABEL: define void @phi_args( -; CHECK-SAME: i1 [[X:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 64, align 32 -; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 32 -; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr -; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP0]] to ptr -; CHECK-NEXT: store i64 1102416563, ptr [[TMP3]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], 8 -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack.2 to i64), ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP0]], 16 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: store i64 ptrtoint (ptr @phi_args to i64), ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 2147450880 -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr -; CHECK-NEXT: store i64 -868082052615769615, ptr [[TMP11]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP9]], 4 -; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: store i8 0, ptr [[TMP13]], align 1 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[TMP2]]) -; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 3 -; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 2147450880 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr -; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP17]], align 1 -; CHECK-NEXT: [[TMP19:%.*]] = icmp ne i8 [[TMP18]], 0 -; CHECK-NEXT: br i1 [[TMP19]], label %[[BB20:.*]], label %[[BB25:.*]], !prof [[PROF1]] -; CHECK: [[BB20]]: -; CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP14]], 7 -; CHECK-NEXT: [[TMP22:%.*]] = trunc i64 [[TMP21]] to i8 -; CHECK-NEXT: [[TMP23:%.*]] = icmp sge i8 [[TMP22]], [[TMP18]] -; CHECK-NEXT: br i1 [[TMP23]], label %[[BB24:.*]], label %[[BB25]] -; CHECK: [[BB24]]: -; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP14]]) #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: [[BB25]]: -; CHECK-NEXT: store volatile i8 0, ptr [[TMP2]], align 1 -; CHECK-NEXT: br i1 [[X]], label %[[BB0:.*]], label %[[BB1:.*]] -; CHECK: [[BB0]]: -; CHECK-NEXT: br label %[[BB1]] -; CHECK: [[BB1]]: -; CHECK-NEXT: [[I_PHI:%.*]] = phi ptr [ [[TMP2]], %[[BB25]] ], [ [[TMP2]], %[[BB0]] ] -; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[TMP9]], 4 -; CHECK-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP26]] to ptr -; CHECK-NEXT: store i8 -8, ptr [[TMP27]], align 1 -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[I_PHI]]) -; CHECK-NEXT: store i64 1172321806, ptr [[TMP3]], align 8 -; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP9]], 0 -; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr -; CHECK-NEXT: store i64 0, ptr [[TMP29]], align 1 -; CHECK-NEXT: ret void -; - -entry: - %i = alloca i64, align 4 - - ; Poison memory in prologue: F1F1F1F1F8F3F3F3 - - call void @llvm.lifetime.start.p0(i64 8, ptr %i) - - store volatile i8 0, ptr %i - - br i1 %x, label %bb0, label %bb1 - -bb0: - br label %bb1 - -bb1: - %i.phi = phi ptr [ %i, %entry ], [ %i, %bb0 ] - call void @llvm.lifetime.end.p0(i64 8, ptr %i.phi) - - ret void -} - -; Check that arguments of lifetime may come from getelementptr nodes. 
-define void @getelementptr_args(i64 %i) sanitize_address{ -; CHECK-LABEL: define void @getelementptr_args( -; CHECK-SAME: i64 [[I:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 1216, align 32 -; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 32 -; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 1184 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP0]] to ptr -; CHECK-NEXT: store i64 1102416563, ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP0]], 8 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack.3 to i64), ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP0]], 16 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: store i64 ptrtoint (ptr @getelementptr_args to i64), ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = lshr i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 2147450880 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: store i32 -235802127, ptr [[TMP13]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP11]], 4 -; CHECK-NEXT: call void @__asan_set_shadow_f8(i64 [[TMP14]], i64 128) -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP11]], 132 -; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr -; CHECK-NEXT: store i64 -940422246894996750, ptr [[TMP16]], align 1 -; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP11]], 140 -; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr -; CHECK-NEXT: store i64 -940422246894996750, ptr [[TMP18]], align 1 -; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP11]], 150 -; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr -; CHECK-NEXT: store i16 -3085, ptr [[TMP20]], align 1 -; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP11]], 4 -; CHECK-NEXT: call void @__asan_set_shadow_00(i64 [[TMP21]], i64 128) -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 1024, ptr [[TMP2]]) -; CHECK-NEXT: [[AI:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i64 0, i64 [[I]] -; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr [[AI]] to i64 -; CHECK-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 3 -; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP23]], 2147450880 -; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr -; CHECK-NEXT: [[TMP26:%.*]] = load i8, ptr [[TMP25]], align 1 -; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i8 [[TMP26]], 0 -; CHECK-NEXT: br i1 [[TMP27]], label %[[BB28:.*]], label %[[BB29:.*]] -; CHECK: [[BB28]]: -; CHECK-NEXT: call void @__asan_report_store8(i64 [[TMP22]]) #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: [[BB29]]: -; CHECK-NEXT: store ptr [[TMP2]], ptr [[AI]], align 8 -; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[TMP11]], 4 -; CHECK-NEXT: call void @__asan_set_shadow_f8(i64 [[TMP30]], i64 128) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1024, ptr [[TMP2]]) -; CHECK-NEXT: store i64 1172321806, ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP11]], 0 -; CHECK-NEXT: call void @__asan_set_shadow_00(i64 [[TMP31]], i64 148) -; CHECK-NEXT: [[TMP32:%.*]] = add i64 [[TMP11]], 150 -; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr -; CHECK-NEXT: store i16 0, ptr [[TMP33]], align 1 -; CHECK-NEXT: ret void -; -entry: - %x = alloca [1024 x i8], align 16 - %a = alloca [2 x ptr], align 8 - - ; 
F1F1F1F1 - ; 0xf2f2f2f2f2f2f2f2 - ; 0xf2f2f2f2f2f2f2f2 - - call void @llvm.lifetime.start.p0(i64 1024, ptr %x) - - %ai = getelementptr inbounds [2 x ptr], ptr %a, i64 0, i64 %i - store ptr %x, ptr %ai, align 8 - - call void @llvm.lifetime.end.p0(i64 1024, ptr %x) - - ret void -} - define void @zero_sized(i64 %a) #0 { ; CHECK-LABEL: define void @zero_sized( ; CHECK-SAME: i64 [[A:%.*]]) { diff --git a/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime-be.ll b/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime-be.ll index 9e21664..b4fe74a 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime-be.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime-be.ll @@ -100,8 +100,6 @@ entry: ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr ; ENTRY-UAS-NEXT: store i8 2, ptr [[PTR]], align 1 - ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 650, ptr %xx) - call void @Foo(ptr %xx) ; CHECK-NEXT: call void @Foo(ptr %xx) @@ -109,8 +107,6 @@ entry: ; ENTRY-UAS-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 4 ; ENTRY-UAS-NEXT: call void @__asan_set_shadow_f8(i64 [[OFFSET]], i64 82) - ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 650, ptr %xx) - call void @llvm.lifetime.start.p0(i64 13, ptr %yy) ; 0005 @@ -118,8 +114,6 @@ entry: ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr ; ENTRY-UAS-NEXT: store i16 5, ptr [[PTR]], align 1 - ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 13, ptr %yy) - call void @Foo(ptr %yy) ; CHECK-NEXT: call void @Foo(ptr %yy) @@ -129,8 +123,6 @@ entry: ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr ; ENTRY-UAS-NEXT: store i16 -1800, ptr [[PTR]], align 1 - ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 13, ptr %yy) - call void @llvm.lifetime.start.p0(i64 40, ptr %zz) ; 00000000 @@ -142,8 +134,6 @@ entry: ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr ; ENTRY-UAS-NEXT: store i8 0, ptr [[PTR]], align 1 - ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 40, ptr %zz) - call void @Foo(ptr %zz) ; CHECK-NEXT: call void @Foo(ptr %zz) @@ -157,8 +147,6 @@ entry: ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr ; ENTRY-UAS-NEXT: store i8 -8, ptr [[PTR]], align 1 - ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 40, ptr %zz) - ; CHECK: {{^[0-9]+}}: ; CHECK-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 0 diff --git a/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime.ll b/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime.ll index 35833ed..fca92cb 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime.ll @@ -100,8 +100,6 @@ entry: ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr ; ENTRY-UAS-NEXT: store i8 2, ptr [[PTR]], align 1 - ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 650, ptr %xx) - call void @Foo(ptr %xx) ; CHECK-NEXT: call void @Foo(ptr %xx) @@ -109,8 +107,6 @@ entry: ; ENTRY-UAS-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 4 ; ENTRY-UAS-NEXT: call void @__asan_set_shadow_f8(i64 [[OFFSET]], i64 82) - ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 650, ptr %xx) - call void @llvm.lifetime.start.p0(i64 13, ptr %yy) ; 0005 @@ -118,8 +114,6 @@ entry: ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr ; ENTRY-UAS-NEXT: store i16 1280, ptr [[PTR]], align 1 - ; CHECK-NEXT: call 
void @llvm.lifetime.start.p0(i64 13, ptr %yy) - call void @Foo(ptr %yy) ; CHECK-NEXT: call void @Foo(ptr %yy) @@ -129,8 +123,6 @@ entry: ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr ; ENTRY-UAS-NEXT: store i16 -1800, ptr [[PTR]], align 1 - ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 13, ptr %yy) - call void @llvm.lifetime.start.p0(i64 40, ptr %zz) ; 00000000 @@ -142,8 +134,6 @@ entry: ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr ; ENTRY-UAS-NEXT: store i8 0, ptr [[PTR]], align 1 - ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 40, ptr %zz) - call void @Foo(ptr %zz) ; CHECK-NEXT: call void @Foo(ptr %zz) @@ -157,8 +147,6 @@ entry: ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr ; ENTRY-UAS-NEXT: store i8 -8, ptr [[PTR]], align 1 - ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 40, ptr %zz) - ; CHECK: {{^[0-9]+}}: ; CHECK-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 0 @@ -209,40 +197,6 @@ entry: ; CHECK: ret void } -declare void @foo(ptr) -define void @PR41481(i1 %b) sanitize_address { -; CHECK-LABEL: @PR41481 -entry: - %p1 = alloca i32 - %p2 = alloca i32 - br label %bb1 - - ; Since we cannot account for all lifetime intrinsics in this function, we - ; might have missed a lifetime.start one and therefore shouldn't poison the - ; allocas at function entry. - ; ENTRY: store i64 -935356719533264399 - ; ENTRY-UAS: store i64 -935356719533264399 - -bb1: - %p = select i1 %b, ptr %p1, ptr %p2 - %q = select i1 %b, ptr %p1, ptr %p2 - call void @llvm.lifetime.start.p0(i64 4, ptr %q) - call void @foo(ptr %p) - br i1 %b, label %bb2, label %bb3 - -bb2: - call void @llvm.lifetime.end.p0(i64 4, ptr %p1) - br label %end - -bb3: - call void @llvm.lifetime.end.p0(i64 4, ptr %p2) - br label %end - -end: - ret void -} - - declare void @llvm.lifetime.start.p0(i64, ptr nocapture) declare void @llvm.lifetime.end.p0(i64, ptr nocapture) diff --git a/llvm/test/Instrumentation/MemorySanitizer/alloca.ll b/llvm/test/Instrumentation/MemorySanitizer/alloca.ll index 25a44ec..40ade5f 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/alloca.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/alloca.ll @@ -176,78 +176,5 @@ entry: ; CHECK: call void @llvm.lifetime.end ; CHECK: ret void - -; If we can't trace one of the lifetime markers to a single alloca, fall back -; to poisoning allocas at the beginning of the function. -; Each alloca must be poisoned only once. 
-define void @lifetime_no_alloca(i8 %v) sanitize_memory { -entry: - %x = alloca i32, align 4 - %y = alloca i32, align 4 - %z = alloca i32, align 4 - %tobool = icmp eq i8 %v, 0 - %xy = select i1 %tobool, ptr %x, ptr %y - %cxcy = select i1 %tobool, ptr %x, ptr %y - br label %another_bb - -another_bb: - call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %z) - store i32 7, ptr %z - call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %z) - call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %z) - store i32 7, ptr %z - call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %z) - call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cxcy) - store i32 8, ptr %xy - call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cxcy) - ret void -} - -; CHECK-LABEL: define void @lifetime_no_alloca( -; CHECK-LABEL: entry: -; CHECK: %x = alloca i32 -; INLINE: call void @llvm.memset.p0.i64(ptr align 4 {{.*}}, i8 -1, i64 4, i1 false) -; CALL: call void @__msan_poison_stack(ptr {{.*}}, i64 4) -; ORIGIN: call void @__msan_set_alloca_origin_with_descr(ptr {{.*}}, i64 4, -; ORIGIN-LEAN: call void @__msan_set_alloca_origin_no_descr(ptr {{.*}}, i64 4, -; KMSAN: call void @__msan_poison_alloca(ptr {{.*}}, i64 4, -; CHECK: %y = alloca i32 -; INLINE: call void @llvm.memset.p0.i64(ptr align 4 {{.*}}, i8 -1, i64 4, i1 false) -; CALL: call void @__msan_poison_stack(ptr {{.*}}, i64 4) -; ORIGIN: call void @__msan_set_alloca_origin_with_descr(ptr {{.*}}, i64 4, -; ORIGIN-LEAN: call void @__msan_set_alloca_origin_no_descr(ptr {{.*}}, i64 4, -; KMSAN: call void @__msan_poison_alloca(ptr {{.*}}, i64 4, -; CHECK: %z = alloca i32 -; INLINE: call void @llvm.memset.p0.i64(ptr align 4 {{.*}}, i8 -1, i64 4, i1 false) -; CALL: call void @__msan_poison_stack(ptr {{.*}}, i64 4) -; ORIGIN: call void @__msan_set_alloca_origin_with_descr(ptr {{.*}}, i64 4, -; ORIGIN-LEAN: call void @__msan_set_alloca_origin_no_descr(ptr {{.*}}, i64 4, -; KMSAN: call void @__msan_poison_alloca(ptr {{.*}}, i64 4, - -; There're two lifetime intrinsics for %z, but we must instrument it only once. 
-; INLINE-NOT: call void @llvm.memset.p0.i64(ptr align 4 {{.*}}, i8 -1, i64 4, i1 false) -; CALL-NOT: call void @__msan_poison_stack(ptr {{.*}}, i64 4) -; ORIGIN-NOT: call void @__msan_set_alloca_origin_with_descr(ptr {{.*}}, i64 4, -; ORIGIN-LEAN-NOT: call void @__msan_set_alloca_origin_no_descr(ptr {{.*}}, i64 4, -; KMSAN-NOT: call void @__msan_poison_alloca(ptr {{.*}}, i64 4, -; CHECK-LABEL: another_bb: - -; CHECK: call void @llvm.lifetime.start -; INLINE-NOT: call void @llvm.memset.p0.i64(ptr align 4 {{.*}}, i8 -1, i64 4, i1 false) -; CALL-NOT: call void @__msan_poison_stack(ptr {{.*}}, i64 4) -; ORIGIN-NOT: call void @__msan_set_alloca_origin_with_descr(ptr {{.*}}, i64 4, -; ORIGIN-LEAN-NOT: call void @__msan_set_alloca_origin_no_descr(ptr {{.*}}, i64 4, -; KMSAN-NOT: call void @__msan_poison_alloca(ptr {{.*}}, i64 4, -; CHECK: call void @llvm.lifetime.end -; CHECK: call void @llvm.lifetime.start -; INLINE-NOT: call void @llvm.memset.p0.i64(ptr align 4 {{.*}}, i8 -1, i64 4, i1 false) -; CALL-NOT: call void @__msan_poison_stack(ptr {{.*}}, i64 4) -; ORIGIN-NOT: call void @__msan_set_alloca_origin_with_descr(ptr {{.*}}, i64 4, -; ORIGIN-LEAN-NOT: call void @__msan_set_alloca_origin_no_descr(ptr {{.*}}, i64 4, -; KMSAN-NOT: call void @__msan_poison_alloca(ptr {{.*}}, i64 4, -; CHECK: call void @llvm.lifetime.end - - - declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s index 6bb0f4b..3d6af6b 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s @@ -3628,6 +3628,18 @@ v_alignbit_b32 v5, v1, v2, exec_lo v_alignbit_b32 v5, v1, v2, exec_hi // GFX10: encoding: [0x05,0x00,0x4e,0xd5,0x01,0x05,0xfe,0x01] +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1] +// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04] + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1] +// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04] + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1] +// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04] + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04] + v_alignbyte_b32 v5, v1, v2, v3 // GFX10: encoding: [0x05,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04] @@ -3715,6 +3727,18 @@ v_alignbyte_b32 v5, v1, v2, exec_lo v_alignbyte_b32 v5, v1, v2, exec_hi // GFX10: encoding: [0x05,0x00,0x4f,0xd5,0x01,0x05,0xfe,0x01] +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1] +// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04] + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1] +// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04] + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1] +// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04] + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04] + v_mullit_f32 v5, v1, v2, v3 // GFX10: encoding: [0x05,0x00,0x50,0xd5,0x01,0x05,0x0e,0x04] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s index 
e81b6a1..d8dfd1e 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s @@ -923,6 +923,71 @@ v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse // WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 // GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] +// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 +// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6 +// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x10,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6 +// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x18,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:15], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4 +// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:15], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x20,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8 +// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x0c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6 +// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x14] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6 +// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + 
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4 +// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x40,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 +// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x14] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], 1.0 +// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], 1.0 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xca,0x03] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x84] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1] +// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1] ; encoding: [0x00,0x04,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0x42,0x1c] // WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s index 47445d3..421d96b 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s @@ -363,6 +363,82 @@ v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:2 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,0,1] // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] 
matrix_b_fmt:-1 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid matrix_b_fmt value + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:xxx +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid matrix_b_fmt value + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP8 +// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] +// GFX1250-ERR-NEXT: {{^}} ^ + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP8 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP8 +// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP8 +// GFX1250-ERR-NEXT: {{^}} ^ + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_BF8 +// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 +// GFX1250-ERR-NEXT: {{^}} ^ + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP6 +// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6 +// GFX1250-ERR-NEXT: {{^}} ^ + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_BF6 +// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6 +// GFX1250-ERR-NEXT: {{^}} ^ + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP4 +// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4 +// GFX1250-ERR-NEXT: {{^}} ^ + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_FP8 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP8 +// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_FP8 +// GFX1250-ERR-NEXT: {{^}} ^ + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_BF8 +// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8 +// GFX1250-ERR-NEXT: {{^}} ^ + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP6 +// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6 +// GFX1250-ERR-NEXT: {{^}} ^ + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_BF6 +// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6 +// 
GFX1250-ERR-NEXT: {{^}} ^ + +v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP4 +// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4 +// GFX1250-ERR-NEXT: {{^}} ^ + v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[1,0,0] // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand // GFX1250-ERR-NEXT: {{^}}v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[1,0,0] diff --git a/llvm/test/MC/AMDGPU/gfx7_err_pos.s b/llvm/test/MC/AMDGPU/gfx7_err_pos.s index 9dcbd4a..7b6b241 100644 --- a/llvm/test/MC/AMDGPU/gfx7_err_pos.s +++ b/llvm/test/MC/AMDGPU/gfx7_err_pos.s @@ -44,3 +44,16 @@ s_load_dword s5, s[2:3], glc // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: cache policy is not supported for SMRD instructions // CHECK-NEXT:{{^}}s_load_dword s5, s[2:3], glc // CHECK-NEXT:{{^}} ^ + +//============================================================================== +// not a valid operand + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// CHECK-NEXT:{{^}}v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK-NEXT:{{^}} ^ + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// CHECK-NEXT:{{^}}v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK-NEXT:{{^}} ^ diff --git a/llvm/test/MC/AMDGPU/gfx8_err_pos.s b/llvm/test/MC/AMDGPU/gfx8_err_pos.s index 1e8457d..a475c73 100644 --- a/llvm/test/MC/AMDGPU/gfx8_err_pos.s +++ b/llvm/test/MC/AMDGPU/gfx8_err_pos.s @@ -49,3 +49,13 @@ v_cndmask_b32_sdwa v5, v1, sext(v2), vcc dst_sel:DWORD dst_unused:UNUSED_PRESERV // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. // CHECK-NEXT:{{^}}v_cndmask_b32_sdwa v5, v1, sext(v2), vcc dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:BYTE_0 src1_sel:WORD_0 // CHECK-NEXT:{{^}} ^ + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// CHECK-NEXT:{{^}}v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK-NEXT:{{^}} ^ + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+// CHECK-NEXT:{{^}}v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK-NEXT:{{^}} ^ diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s b/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s index f3f4cae..a1cd9ce 100644 --- a/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s +++ b/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s @@ -2829,6 +2829,18 @@ v_alignbit_b32 v5, v1, v2, src_execz v_alignbit_b32 v5, v1, v2, src_scc // CHECK: [0x05,0x00,0xce,0xd1,0x01,0x05,0xf6,0x03] +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04] +// CHECK: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04] + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04] +// CHECK: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04] + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04] +// CHECK: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04] + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04] +// CHECK: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04] + v_alignbyte_b32 v5, v1, v2, v3 // CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04] @@ -3000,6 +3012,18 @@ v_alignbyte_b32 v5, v1, v2, src_execz v_alignbyte_b32 v5, v1, v2, src_scc // CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x05,0xf6,0x03] +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1] +// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04] + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1] +// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04] + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1] +// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04] + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04] + v_min3_f32 v5, v1, v2, v3 // CHECK: [0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04] diff --git a/llvm/test/MC/AVR/inst-brbc.s b/llvm/test/MC/AVR/inst-brbc.s index 6d96393..bf73188 100644 --- a/llvm/test/MC/AVR/inst-brbc.s +++ b/llvm/test/MC/AVR/inst-brbc.s @@ -15,8 +15,10 @@ foo: ; CHECK: brcc .Ltmp1-16+2 ; encoding: [0bAAAAA000,0b111101AA] ; INST-LABEL: <foo>: -; INST-NEXT: 23 f4 brvc .+8 -; INST-NEXT: c0 f7 brsh .-16 +; INST-NEXT: fb f7 brvc .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0xa +; INST-NEXT: f8 f7 brsh .-2 +; INST-NEXT: R_AVR_7_PCREL .text-0xc ; INST-NEXT: 59 f7 brne .-42 ; INST-NEXT: 52 f7 brpl .-44 ; INST-NEXT: 4c f7 brge .-46 diff --git a/llvm/test/MC/AVR/inst-brbs.s b/llvm/test/MC/AVR/inst-brbs.s index 9dde5e1..3e64ebc 100644 --- a/llvm/test/MC/AVR/inst-brbs.s +++ b/llvm/test/MC/AVR/inst-brbs.s @@ -14,8 +14,10 @@ foo: ; CHECK: brcs .Ltmp1-12+2 ; encoding: [0bAAAAA000,0b111100AA] ; INST-LABEL: <foo>: -; INST-NEXT: 23 f0 brvs .+8 -; INST-NEXT: d0 f3 brlo .-12 +; INST-NEXT: fb f3 brvs .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0xa +; INST-NEXT: f8 f3 brlo .-2 +; INST-NEXT: R_AVR_7_PCREL .text-0x8 ; INST-NEXT: 59 f3 breq .-42 ; INST-NEXT: 52 f3 brmi .-44 ; INST-NEXT: 4c f3 brlt .-46 diff --git a/llvm/test/MC/AVR/inst-brcc.s b/llvm/test/MC/AVR/inst-brcc.s index 0edefa1..eba05e0 100644 --- a/llvm/test/MC/AVR/inst-brcc.s +++ b/llvm/test/MC/AVR/inst-brcc.s @@ -18,7 +18,11 @@ bar: ; CHECK: brcc bar ; encoding: [0bAAAAA000,0b111101AA] ; INST-LABEL: <foo>: -; INST-NEXT: 08 f5 brsh .+66 -; INST-NEXT: a8 f7 brsh .-22 -; INST-NEXT: 08 f5 brsh .+66 -; INST-NEXT: 00 f4 brsh .+0 +; 
INST-NEXT: f8 f7 brsh .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x44 +; INST-NEXT: f8 f7 brsh .-2 +; INST-NEXT: R_AVR_7_PCREL .text-0x12 +; INST-NEXT: f8 f7 brsh .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x48 +; INST-NEXT: f8 f7 brsh .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x8 diff --git a/llvm/test/MC/AVR/inst-brcs.s b/llvm/test/MC/AVR/inst-brcs.s index ea8a3f5..fb4e0dd 100644 --- a/llvm/test/MC/AVR/inst-brcs.s +++ b/llvm/test/MC/AVR/inst-brcs.s @@ -18,7 +18,11 @@ bar: ; CHECK: brcs bar ; encoding: [0bAAAAA000,0b111100AA] ; INST-LABEL: <foo>: -; INST-NEXT: 20 f0 brlo .+8 -; INST-NEXT: 10 f0 brlo .+4 -; INST-NEXT: 20 f0 brlo .+8 -; INST-NEXT: 00 f0 brlo .+0 +; INST-NEXT: f8 f3 brlo .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0xa +; INST-NEXT: f8 f3 brlo .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x8 +; INST-NEXT: f8 f3 brlo .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0xe +; INST-NEXT: f8 f3 brlo .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x8 diff --git a/llvm/test/MC/AVR/inst-breq.s b/llvm/test/MC/AVR/inst-breq.s index d916f6d..8b8e85a 100644 --- a/llvm/test/MC/AVR/inst-breq.s +++ b/llvm/test/MC/AVR/inst-breq.s @@ -18,7 +18,10 @@ bar: ; CHECK: brbs 1, bar ; encoding: [0bAAAAA001,0b111100AA] ; INST-LABEL: <foo>: -; INST-NEXT: b9 f3 breq .-18 -; INST-NEXT: d1 f3 breq .-12 -; INST-NEXT: b9 f3 breq .-18 -; INST-NEXT: 01 f0 breq .+0 +; INST-NEXT: f9 f3 breq .-2 +; INST-NEXT: R_AVR_7_PCREL .text-0x10 +; INST-NEXT: f9 f3 breq .-2 +; INST-NEXT: R_AVR_7_PCREL .text-0x8 +; INST-NEXT: f9 f3 breq .-2 +; INST-NEXT: R_AVR_7_PCREL .text-0xc +; INST-NEXT: f9 f3 breq .-2 diff --git a/llvm/test/MC/AVR/inst-brge.s b/llvm/test/MC/AVR/inst-brge.s index 3a8fd72..ed96d89 100644 --- a/llvm/test/MC/AVR/inst-brge.s +++ b/llvm/test/MC/AVR/inst-brge.s @@ -16,6 +16,9 @@ bar: ; CHECK: brge bar ; encoding: [0bAAAAA100,0b111101AA] ; INST-LABEL: <foo>: -; INST-NEXT: cc f4 brge .+50 -; INST-NEXT: ac f4 brge .+42 -; INST-NEXT: 04 f4 brge .+0 +; INST-NEXT: fc f7 brge .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x34 +; INST-NEXT: fc f7 brge .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x2e +; INST-NEXT: fc f7 brge .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x6 diff --git a/llvm/test/MC/AVR/inst-brhc.s b/llvm/test/MC/AVR/inst-brhc.s index 4fc55b6..8421c91 100644 --- a/llvm/test/MC/AVR/inst-brhc.s +++ b/llvm/test/MC/AVR/inst-brhc.s @@ -16,6 +16,9 @@ bar: ; CHECK: brhc bar ; encoding: [0bAAAAA101,0b111101AA] ; INST-LABEL: <foo>: -; INST-NEXT: 35 f4 brhc .+12 -; INST-NEXT: 3d f4 brhc .+14 -; INST-NEXT: 05 f4 brhc .+0 +; INST-NEXT: fd f7 brhc .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0xe +; INST-NEXT: fd f7 brhc .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x12 +; INST-NEXT: fd f7 brhc .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x6 diff --git a/llvm/test/MC/AVR/inst-brhs.s b/llvm/test/MC/AVR/inst-brhs.s index d0968753..a3777b4 100644 --- a/llvm/test/MC/AVR/inst-brhs.s +++ b/llvm/test/MC/AVR/inst-brhs.s @@ -16,6 +16,9 @@ bar: ; CHECK: brhs bar ; encoding: [0bAAAAA101,0b111100AA] ; INST-LABEL: <foo>: -; INST-NEXT: fd f2 brhs .-66 -; INST-NEXT: 3d f0 brhs .+14 -; INST-NEXT: 05 f0 brhs .+0 +; INST-NEXT: fd f3 brhs .-2 +; INST-NEXT: R_AVR_7_PCREL .text-0x40 +; INST-NEXT: fd f3 brhs .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x12 +; INST-NEXT: fd f3 brhs .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x6 diff --git a/llvm/test/MC/AVR/inst-brid.s b/llvm/test/MC/AVR/inst-brid.s index 2a3a30f..888ae02 100644 --- a/llvm/test/MC/AVR/inst-brid.s +++ b/llvm/test/MC/AVR/inst-brid.s @@ -16,6 +16,9 @@ bar: ; CHECK: brid bar ; encoding: [0bAAAAA111,0b111101AA] ; INST-LABEL: <foo>: -; INST-NEXT: af f4 brid .+42 -; INST-NEXT: 
ff f4 brid .+62 -; INST-NEXT: 07 f4 brid .+0 +; INST-NEXT: ff f7 brid .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x2c +; INST-NEXT: ff f7 brid .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x42 +; INST-NEXT: ff f7 brid .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x6 diff --git a/llvm/test/MC/AVR/inst-brie.s b/llvm/test/MC/AVR/inst-brie.s index 4f867ae..1d175f1 100644 --- a/llvm/test/MC/AVR/inst-brie.s +++ b/llvm/test/MC/AVR/inst-brie.s @@ -16,6 +16,9 @@ bar: ; CHECK: brie bar ; encoding: [0bAAAAA111,0b111100AA] ; INST-LABEL: <foo>: -; INST-NEXT: 57 f0 brie .+20 -; INST-NEXT: a7 f0 brie .+40 -; INST-NEXT: 07 f0 brie .+0 +; INST-NEXT: ff f3 brie .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x16 +; INST-NEXT: ff f3 brie .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x2c +; INST-NEXT: ff f3 brie .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x6 diff --git a/llvm/test/MC/AVR/inst-brlo.s b/llvm/test/MC/AVR/inst-brlo.s index 48499aa..4b57e77 100644 --- a/llvm/test/MC/AVR/inst-brlo.s +++ b/llvm/test/MC/AVR/inst-brlo.s @@ -16,6 +16,9 @@ bar: ; CHECK: brlo bar ; encoding: [0bAAAAA000,0b111100AA] ; INST-LABEL: <foo>: -; INST-NEXT: 30 f0 brlo .+12 -; INST-NEXT: 70 f0 brlo .+28 -; INST-NEXT: 00 f0 brlo .+0 +; INST-NEXT: f8 f3 brlo .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0xe +; INST-NEXT: f8 f3 brlo .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x20 +; INST-NEXT: f8 f3 brlo .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x6 diff --git a/llvm/test/MC/AVR/inst-brlt.s b/llvm/test/MC/AVR/inst-brlt.s index e16fd05..58e57c4d 100644 --- a/llvm/test/MC/AVR/inst-brlt.s +++ b/llvm/test/MC/AVR/inst-brlt.s @@ -16,6 +16,9 @@ bar: ; CHECK: brlt bar ; encoding: [0bAAAAA100,0b111100AA] ; INST-LABEL: <foo>: -; INST-NEXT: 44 f0 brlt .+16 -; INST-NEXT: 0c f0 brlt .+2 -; INST-NEXT: 04 f0 brlt .+0 +; INST-NEXT: fc f3 brlt .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x12 +; INST-NEXT: fc f3 brlt .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x6 +; INST-NEXT: fc f3 brlt .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x6 diff --git a/llvm/test/MC/AVR/inst-brmi.s b/llvm/test/MC/AVR/inst-brmi.s index 0d46af8..c406448 100644 --- a/llvm/test/MC/AVR/inst-brmi.s +++ b/llvm/test/MC/AVR/inst-brmi.s @@ -16,6 +16,9 @@ bar: ; CHECK: brmi bar ; encoding: [0bAAAAA010,0b111100AA] ; INST-LABEL: <foo>: -; INST-NEXT: 0a f1 brmi .+66 -; INST-NEXT: ea f0 brmi .+58 -; INST-NEXT: 02 f0 brmi .+0 +; INST-NEXT: fa f3 brmi .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x44 +; INST-NEXT: fa f3 brmi .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x3e +; INST-NEXT: fa f3 brmi .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x6 diff --git a/llvm/test/MC/AVR/inst-brne.s b/llvm/test/MC/AVR/inst-brne.s index e87813a..4b00c63 100644 --- a/llvm/test/MC/AVR/inst-brne.s +++ b/llvm/test/MC/AVR/inst-brne.s @@ -18,7 +18,10 @@ bar: ; CHECK: brbc 1, bar ; encoding: [0bAAAAA001,0b111101AA] ; INST-LABEL: <foo>: -; INST-NEXT: 29 f4 brne .+10 -; INST-NEXT: 09 f4 brne .+2 -; INST-NEXT: 29 f4 brne .+10 -; INST-NEXT: 01 f4 brne .+0 +; INST-NEXT: f9 f7 brne .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0xc +; INST-NEXT: f9 f7 brne .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x6 +; INST-NEXT: f9 f7 brne .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x10 +; INST-NEXT: f9 f7 brne .-2 diff --git a/llvm/test/MC/AVR/inst-brpl.s b/llvm/test/MC/AVR/inst-brpl.s index 3487796..9049e24 100644 --- a/llvm/test/MC/AVR/inst-brpl.s +++ b/llvm/test/MC/AVR/inst-brpl.s @@ -16,6 +16,9 @@ bar: ; CHECK: brpl bar ; encoding: [0bAAAAA010,0b111101AA] ; INST-LABEL: <foo>: -; INST-NEXT: d2 f7 brpl .-12 -; INST-NEXT: 4a f4 brpl .+18 -; INST-NEXT: 02 f4 brpl .+0 +; INST-NEXT: fa f7 brpl .-2 +; INST-NEXT: R_AVR_7_PCREL .text-0xa +; INST-NEXT: 
fa f7 brpl .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x16 +; INST-NEXT: fa f7 brpl .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x6 diff --git a/llvm/test/MC/AVR/inst-brsh.s b/llvm/test/MC/AVR/inst-brsh.s index be0a06c..0f32fba 100644 --- a/llvm/test/MC/AVR/inst-brsh.s +++ b/llvm/test/MC/AVR/inst-brsh.s @@ -16,6 +16,9 @@ bar: ; CHECK: brsh bar ; encoding: [0bAAAAA000,0b111101AA] ; INST-LABEL: <foo>: -; INST-NEXT: 80 f4 brsh .+32 -; INST-NEXT: 18 f5 brsh .+70 -; INST-NEXT: 00 f4 brsh .+0 +; INST-NEXT: f8 f7 brsh .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x22 +; INST-NEXT: f8 f7 brsh .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x4a +; INST-NEXT: f8 f7 brsh .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x6 diff --git a/llvm/test/MC/AVR/inst-brtc.s b/llvm/test/MC/AVR/inst-brtc.s index 312c55c..731b495 100644 --- a/llvm/test/MC/AVR/inst-brtc.s +++ b/llvm/test/MC/AVR/inst-brtc.s @@ -16,6 +16,9 @@ bar: ; CHECK: brtc bar ; encoding: [0bAAAAA110,0b111101AA] ; INST-LABEL: <foo>: -; INST-NEXT: d6 f4 brtc .+52 -; INST-NEXT: ce f4 brtc .+50 -; INST-NEXT: 06 f4 brtc .+0 +; INST-NEXT: fe f7 brtc .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x36 +; INST-NEXT: fe f7 brtc .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x36 +; INST-NEXT: fe f7 brtc .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x6 diff --git a/llvm/test/MC/AVR/inst-brts.s b/llvm/test/MC/AVR/inst-brts.s index 40ef6af..bb00acb 100644 --- a/llvm/test/MC/AVR/inst-brts.s +++ b/llvm/test/MC/AVR/inst-brts.s @@ -16,6 +16,9 @@ bar: ; CHECK: brts bar ; encoding: [0bAAAAA110,0b111100AA] ; INST-LABEL: <foo>: -; INST-NEXT: 4e f0 brts .+18 -; INST-NEXT: 5e f0 brts .+22 -; INST-NEXT: 06 f0 brts .+0 +; INST-NEXT: fe f3 brts .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x14 +; INST-NEXT: fe f3 brts .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x1a +; INST-NEXT: fe f3 brts .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x6 diff --git a/llvm/test/MC/AVR/inst-brvc.s b/llvm/test/MC/AVR/inst-brvc.s index d493ff1..f65e735 100644 --- a/llvm/test/MC/AVR/inst-brvc.s +++ b/llvm/test/MC/AVR/inst-brvc.s @@ -16,6 +16,9 @@ bar: ; CHECK: brvc bar ; encoding: [0bAAAAA011,0b111101AA] ; INST-LABEL: <foo>: -; INST-NEXT: 93 f7 brvc .-28 -; INST-NEXT: 0b f7 brvc .-62 -; INST-NEXT: 03 f4 brvc .+0 +; INST-NEXT: fb f7 brvc .-2 +; INST-NEXT: R_AVR_7_PCREL .text-0x1a +; INST-NEXT: fb f7 brvc .-2 +; INST-NEXT: R_AVR_7_PCREL .text-0x3a +; INST-NEXT: fb f7 brvc .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x6 diff --git a/llvm/test/MC/AVR/inst-brvs.s b/llvm/test/MC/AVR/inst-brvs.s index 07755d8..a5b7e4b 100644 --- a/llvm/test/MC/AVR/inst-brvs.s +++ b/llvm/test/MC/AVR/inst-brvs.s @@ -16,6 +16,9 @@ bar: ; CHECK: brvs bar ; encoding: [0bAAAAA011,0b111100AA] ; INST-LABEL: <foo>: -; INST-NEXT: 4b f0 brvs .+18 -; INST-NEXT: 83 f0 brvs .+32 -; INST-NEXT: 03 f0 brvs .+0 +; INST-NEXT: fb f3 brvs .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x14 +; INST-NEXT: fb f3 brvs .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x24 +; INST-NEXT: fb f3 brvs .-2 +; INST-NEXT: R_AVR_7_PCREL .text+0x6 diff --git a/llvm/test/MC/AVR/inst-rcall.s b/llvm/test/MC/AVR/inst-rcall.s index 1da6e7f..f7818aa 100644 --- a/llvm/test/MC/AVR/inst-rcall.s +++ b/llvm/test/MC/AVR/inst-rcall.s @@ -17,8 +17,11 @@ foo: ; CHECK: rcall .Ltmp3+46+2 ; encoding: [A,0b1101AAAA] ; INST-LABEL: <foo>: -; INST-NEXT: 00 d0 rcall .+0 -; INST-NEXT: fc df rcall .-8 -; INST-NEXT: 06 d0 rcall .+12 -; INST-NEXT: 17 d0 rcall .+46 -; INST-NEXT: ea df rcall .-44 +; INST-NEXT: ff df rcall .-2 +; INST-NEXT: R_AVR_13_PCREL .text+0x2 +; INST-NEXT: ff df rcall .-2 +; INST-NEXT: R_AVR_13_PCREL .text-0x4 +; INST-NEXT: ff df rcall .-2 +; INST-NEXT: 
R_AVR_13_PCREL .text+0x12 +; INST-NEXT: ff df rcall .-2 +; INST-NEXT: R_AVR_13_PCREL .text+0x36 diff --git a/llvm/test/MC/AVR/inst-rjmp.s b/llvm/test/MC/AVR/inst-rjmp.s index 6712319..6ac6343 100644 --- a/llvm/test/MC/AVR/inst-rjmp.s +++ b/llvm/test/MC/AVR/inst-rjmp.s @@ -33,18 +33,28 @@ x: ; CHECK: rjmp .Ltmp6+4094+2 ; encoding: [A,0b1100AAAA] ; INST-LABEL: <foo>: -; INST-NEXT: 01 c0 rjmp .+2 ; INST-NEXT: ff cf rjmp .-2 -; INST-NEXT: fd cf rjmp .-6 -; INST-NEXT: 04 c0 rjmp .+8 -; INST-NEXT: 01 c0 rjmp .+2 -; INST-NEXT: 00 c0 rjmp .+0 +; INST-NEXT: R_AVR_13_PCREL .text+0x4 +; INST-NEXT: ff cf rjmp .-2 +; INST-NEXT: R_AVR_13_PCREL .text+0x2 +; INST-NEXT: ff cf rjmp .-2 +; INST-NEXT: R_AVR_13_PCREL .text +; INST-NEXT: ff cf rjmp .-2 +; INST-NEXT: R_AVR_13_PCREL .text+0x10 +; INST-NEXT: ff cf rjmp .-2 +; INST-NEXT: R_AVR_13_PCREL .text+0xc +; INST-NEXT: ff cf rjmp .-2 +; INST-NEXT: R_AVR_13_PCREL .text+0xc ; INST-EMPTY: ; INST-LABEL: <end>: -; INST-NEXT: fe cf rjmp .-4 -; INST-NEXT: fd cf rjmp .-6 +; INST-NEXT: ff cf rjmp .-2 +; INST-NEXT: R_AVR_13_PCREL .text+0xa +; INST-NEXT: ff cf rjmp .-2 +; INST-NEXT: R_AVR_13_PCREL .text+0xa ; INST-EMPTY: ; INST-LABEL: <x>: ; INST-NEXT: ff cf rjmp .-2 +; INST-NEXT: R_AVR_13_PCREL .text+0x10 ; INST-NEXT: 0f c0 rjmp .+30 -; INST-NEXT: ff c7 rjmp .+4094 +; INST-NEXT: ff cf rjmp .-2 +; INST-NEXT: R_AVR_13_PCREL .text+0x1014 diff --git a/llvm/test/MC/COFF/bss-text.s b/llvm/test/MC/COFF/bss-text.s index ed68905..cedbb2f 100644 --- a/llvm/test/MC/COFF/bss-text.s +++ b/llvm/test/MC/COFF/bss-text.s @@ -1,13 +1,15 @@ -# RUN: not llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o /dev/null 2>&1 | FileCheck %s +# RUN: not llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o /dev/null 2>&1 | FileCheck %s --implicit-check-not=error: ## -filetype=asm does not check the error. 
# RUN: llvm-mc -triple=x86_64-pc-win32 %s +.bss +# CHECK: <unknown>:0: error: BSS section '.bss' cannot have non-zero bytes + addb %bl,(%rax) + .section uninitialized,"b" -# MCRelaxableFragment -# CHECK: {{.*}}.s:[[#@LINE+1]]:3: error: IMAGE_SCN_CNT_UNINITIALIZED_DATA section 'uninitialized' cannot have instructions +# CHECK: <unknown>:0: error: BSS section 'uninitialized' cannot have non-zero bytes jmp foo -.bss -# CHECK: {{.*}}.s:[[#@LINE+1]]:3: error: IMAGE_SCN_CNT_UNINITIALIZED_DATA section '.bss' cannot have instructions +.section bss0,"b" addb %al,(%rax) diff --git a/llvm/test/MC/COFF/section.s b/llvm/test/MC/COFF/section.s index 9c1a11e..fdd6570 100644 --- a/llvm/test/MC/COFF/section.s +++ b/llvm/test/MC/COFF/section.s @@ -29,7 +29,7 @@ .section s ; .long 1 .section s_, "" ; .long 1 .section s_a,"a"; .long 1 -.section s_b,"b"; .long 1 +.section s_b,"b"; .long 0 .section s_d,"d"; .long 1 .section s_D,"D"; .long 1 .section s_n,"n"; .long 1 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt index 721babd..08ed50d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt @@ -1146,6 +1146,18 @@ # GFX10: v_alignbit_b32 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x00,0x4e,0xd5,0x6a,0x04,0x0e,0x04] 0x05,0x00,0x4e,0xd5,0x6a,0x04,0x0e,0x04 +# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04 + # GFX10: v_alignbyte_b32 v255, v1, v2, v3 ; encoding: [0xff,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04] 0xff,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04 @@ -1233,6 +1245,18 @@ # GFX10: v_alignbyte_b32 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x00,0x4f,0xd5,0x6a,0x04,0x0e,0x04] 0x05,0x00,0x4f,0xd5,0x6a,0x04,0x0e,0x04 +# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04 + # GFX10: v_and_b32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x1b,0xd5,0x01,0x05,0x02,0x00] 0xff,0x00,0x1b,0xd5,0x01,0x05,0x02,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt index d76ec4c..e20f020 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt @@ -364,6 +364,45 @@ 0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x9c # GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x9c] 
+0x00,0x20,0x33,0xcc,0x08,0x31,0xa2,0x04 +# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:15], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x20,0x33,0xcc,0x08,0x31,0xa2,0x04] + +0x00,0x18,0x33,0xcc,0x08,0x31,0xa2,0x04 +# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x18,0x33,0xcc,0x08,0x31,0xa2,0x04] + +0x00,0x10,0x33,0xcc,0x08,0x31,0xa2,0x04 +# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x10,0x33,0xcc,0x08,0x31,0xa2,0x04] + +0x00,0x40,0x33,0xcc,0x08,0x31,0xa2,0x04 +# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x40,0x33,0xcc,0x08,0x31,0xa2,0x04] + +0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x14 +# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x14] + +0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x1c +# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x1c] + +0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x14 +# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x14] + +0x00,0x00,0x33,0xcc,0x08,0x31,0xca,0x03 +# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], 1.0 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xca,0x03] + +0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04 +# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] + +0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x04 +# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x04] + +0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x0c +# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x0c] + +0x00,0x04,0x33,0xcc,0x08,0x31,0xa2,0x04 +# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1] ; encoding: [0x00,0x04,0x33,0xcc,0x08,0x31,0xa2,0x04] + +0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x84 +# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x84] + 0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x1b # GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x1b] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt index 618e081..802d6368 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt @@ -11310,6 +11310,18 @@ # CHECK: v_alignbit_b32 v5, v1, v2, exec_hi ; encoding: [0x05,0x00,0xce,0xd1,0x01,0x05,0xfe,0x01] 0x05,0x00,0xce,0xd1,0x01,0x05,0xfe,0x01 +# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04] 
+0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04 + # CHECK: v_alignbyte_b32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04] 0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04 @@ -11406,6 +11418,18 @@ # CHECK: v_alignbyte_b32 v5, v1, v2, exec_hi ; encoding: [0x05,0x00,0xcf,0xd1,0x01,0x05,0xfe,0x01] 0x05,0x00,0xcf,0xd1,0x01,0x05,0xfe,0x01 +# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04 + # CHECK: v_min3_f32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04] 0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04 diff --git a/llvm/test/MC/ELF/mc-dump.s b/llvm/test/MC/ELF/mc-dump.s index 5cc2e9f..fd6cf95 100644 --- a/llvm/test/MC/ELF/mc-dump.s +++ b/llvm/test/MC/ELF/mc-dump.s @@ -6,9 +6,9 @@ #CHECK-LABEL:assembler backend - final-layout # CHECK:Sections:[ # CHECK-NEXT:MCSection Name:.text -# CHECK-NEXT:0 Data Size:0 [] +# CHECK-NEXT:0 Align Size:0+0 [] +# CHECK-NEXT: Align:4 Fill:0 FillLen:1 MaxBytesToEmit:4 Nops # CHECK-NEXT: Symbol @0 .text -# CHECK-NEXT:0 Align Align:4 Fill:0 FillLen:1 MaxBytesToEmit:4 Nops # CHECK-NEXT:0 Data Size:0 [] # CHECK-NEXT: Symbol @0 _start # CHECK-NEXT: Symbol @0 Temporary @@ -22,9 +22,9 @@ # CHECK-NEXT: Symbol @0 Temporary # CHECK-NEXT: Symbol @16 Temporary # CHECK-NEXT:MCSection Name:.data -# CHECK-NEXT:0 Data Size:0 [] +# CHECK-NEXT:0 Align Size:0+0 [] +# CHECK-NEXT: Align:4 Fill:0 FillLen:1 MaxBytesToEmit:4 # CHECK-NEXT: Symbol @0 .data -# CHECK-NEXT:0 Align Align:4 Fill:0 FillLen:1 MaxBytesToEmit:4 # CHECK-NEXT:0 Data Size:4 [01,00,00,00] # CHECK-NEXT:4 Fill Value:0 ValueSize:1 NumValues:1 # CHECK-NEXT:5 LEB Size:0+1 [15] Value:.Ltmp0-_start Signed:0 diff --git a/llvm/test/MC/ELF/nobits-non-zero-value.s b/llvm/test/MC/ELF/nobits-non-zero-value.s index ff43e69..ea95ec97 100644 --- a/llvm/test/MC/ELF/nobits-non-zero-value.s +++ b/llvm/test/MC/ELF/nobits-non-zero-value.s @@ -1,26 +1,45 @@ -# RUN: not llvm-mc -filetype=obj -triple=x86_64 %s -o /dev/null 2>&1 | FileCheck %s --implicit-check-not=error: +# RUN: not llvm-mc -filetype=obj -triple=x86_64 %s -o /dev/null 2>&1 | FileCheck %s --implicit-check-not=error: --implicit-check-not=warning: ## -filetype=asm does not check the error. 
# RUN: llvm-mc -triple=x86_64 %s .section .tbss,"aw",@nobits -# MCRelaxableFragment -# CHECK: {{.*}}.s:[[#@LINE+1]]:3: error: SHT_NOBITS section '.tbss' cannot have instructions jmp foo .bss -# CHECK: {{.*}}.s:[[#@LINE+1]]:3: error: SHT_NOBITS section '.bss' cannot have instructions addb %al,(%rax) -# CHECK: {{.*}}.s:[[#@LINE+1]]:11: warning: ignoring non-zero fill value in SHT_NOBITS section '.bss' +# CHECK: {{.*}}.s:[[#@LINE+1]]:11: warning: ignoring non-zero fill value in BSS section '.bss' .align 4, 42 -# CHECK-NOT: {{.*}}.s:[[#@LINE+1]]:11: warning: ignoring non-zero fill value in SHT_NOBITS section '.bss' -.align 4, 0 - -# CHECK: <unknown>:0: error: SHT_NOBITS section '.bss' cannot have non-zero initializers .long 1 -.section .bss1,"aw",%nobits -# CHECK: <unknown>:0: error: SHT_NOBITS section '.bss1' cannot have fixups +.section .bss0,"aw",%nobits +addb %al,(%rax) + +.section data_fixup,"aw",%nobits .quad foo + +.section fill,"aw",%nobits +.fill b-a,1,1 + +.section org,"aw",%nobits +.org 1,1 + +.section ok,"aw",%nobits +.org 1 +.fill 1 +.fill b-a,1,0 +.align 4, 0 +.long 0 + +.text +a: nop +b: + +## Location is not tracked for efficiency. +# CHECK: <unknown>:0: error: BSS section '.tbss' cannot have non-zero bytes +# CHECK: <unknown>:0: error: BSS section '.bss' cannot have non-zero bytes +# CHECK: <unknown>:0: error: BSS section 'data_fixup' cannot have fixups +# CHECK: <unknown>:0: error: BSS section 'fill' cannot have non-zero bytes +# CHECK: <unknown>:0: error: BSS section 'org' cannot have non-zero bytes diff --git a/llvm/test/MC/RISCV/Relocations/mc-dump.s b/llvm/test/MC/RISCV/Relocations/mc-dump.s index f722584..e8f4b14 100644 --- a/llvm/test/MC/RISCV/Relocations/mc-dump.s +++ b/llvm/test/MC/RISCV/Relocations/mc-dump.s @@ -3,16 +3,18 @@ # CHECK:Sections:[ # CHECK-NEXT:MCSection Name:.text -# CHECK-NEXT:0 Data Size:0 [] +# CHECK-NEXT:0 Align Size:0+0 [] +# CHECK-NEXT: Align:4 Fill:0 FillLen:1 MaxBytesToEmit:4 Nops # CHECK-NEXT: Symbol @0 .text -# CHECK-NEXT:0 Align Align:4 Fill:0 FillLen:1 MaxBytesToEmit:4 Nops # CHECK-NEXT:0 Data LinkerRelaxable Size:8 [97,00,00,00,e7,80,00,00] # CHECK-NEXT: Fixup @0 Value:specifier(19,ext) Kind:4023 # CHECK-NEXT: Symbol @0 $x -# CHECK-NEXT:8 Data Size:0 [] -# CHECK-NEXT:8 Align Align:8 Fill:0 FillLen:1 MaxBytesToEmit:8 Nops -# CHECK-NEXT:12 Data Size:4 [13,05,30,00] -# CHECK-NEXT:16 Align Align:8 Fill:0 FillLen:1 MaxBytesToEmit:8 Nops +# CHECK-NEXT:8 Align Size:0+4 [] +# CHECK-NEXT: Align:8 Fill:0 FillLen:1 MaxBytesToEmit:8 Nops +# CHECK-NEXT: Fixup @0 Value:4 Kind:[[#]] +# CHECK-NEXT:12 Align Size:4+4 [13,05,30,00] +# CHECK-NEXT: Align:8 Fill:0 FillLen:1 MaxBytesToEmit:8 Nops +# CHECK-NEXT: Fixup @4 Value:4 Kind:[[#]] # CHECK-NEXT:] call ext diff --git a/llvm/test/Other/new-pm-print-pipeline.ll b/llvm/test/Other/new-pm-print-pipeline.ll index db398d6..6fa57f1 100644 --- a/llvm/test/Other/new-pm-print-pipeline.ll +++ b/llvm/test/Other/new-pm-print-pipeline.ll @@ -32,7 +32,7 @@ ; CHECK-10: function(loop-unroll<O2>,loop-unroll<partial;peeling;runtime;upperbound;profile-peeling;full-unroll-max=5;O1>,loop-unroll<no-partial;no-peeling;no-runtime;no-upperbound;no-profile-peeling;full-unroll-max=7;O1>) ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(gvn<>,gvn<pre;load-pre;split-backedge-load-pre;memdep;memoryssa>,gvn<no-pre;no-load-pre;no-split-backedge-load-pre;no-memdep;no-memoryssa>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-11 -; CHECK-11: 
function(gvn<>,gvn<pre;load-pre;split-backedge-load-pre;memdep;memoryssa>,gvn<no-pre;no-load-pre;no-split-backedge-load-pre;no-memdep;no-memoryssa>) +; CHECK-11: function(gvn<>,gvn<pre;load-pre;split-backedge-load-pre;no-memdep;memoryssa>,gvn<no-pre;no-load-pre;no-split-backedge-load-pre;memdep;no-memoryssa>) ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(early-cse<>,early-cse<memssa>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-12 ; CHECK-12: function(early-cse<>,early-cse<memssa>) diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge-be.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge-be.ll new file mode 100644 index 0000000..34f3924 --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge-be.ll @@ -0,0 +1,106 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=aggressive-instcombine -mtriple=x86_64-unknown-linux-gnu -data-layout="E-n64" < %s | FileCheck %s + +; Pretend X86 is big endian. + +; FIXME: Big endian not supported yet. + +define void @test_i32_be(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_be( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8 +; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr i8, ptr [[P]], i64 3 +; CHECK-NEXT: store i8 [[X_0]], ptr [[GEP_0]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 8 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 2 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[X_2:%.*]] = trunc i32 [[SHR_2]] to i8 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_2]], ptr [[GEP_2]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X]], 24 +; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[TMP1]] to i8 +; CHECK-NEXT: store i8 [[X_3]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i8 + %gep.0 = getelementptr i8, ptr %p, i64 3 + store i8 %x.0, ptr %gep.0 + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i8 %x.1, ptr %gep.1 + %shr.2 = lshr i32 %x, 16 + %x.2 = trunc i32 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p, i64 1 + store i8 %x.2, ptr %gep.2 + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + store i8 %x.3, ptr %p + ret void +} + +define void @test_i32_le(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_le( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8 +; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 8 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[X_2:%.*]] = trunc i32 [[SHR_2]] to i8 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 2 +; CHECK-NEXT: store i8 [[X_2]], ptr [[GEP_2]], align 1 +; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24 +; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8 +; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3 +; CHECK-NEXT: store i8 [[X_3]], ptr [[GEP_3]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 8 + 
%x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + %shr.2 = lshr i32 %x, 16 + %x.2 = trunc i32 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p, i64 2 + store i8 %x.2, ptr %gep.2 + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + %gep.3 = getelementptr i8, ptr %p, i64 3 + store i8 %x.3, ptr %gep.3 + ret void +} + +define void @test_i32_mixed_parts(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_mixed_parts( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8 +; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr i8, ptr [[P]], i64 3 +; CHECK-NEXT: store i8 [[X_0]], ptr [[GEP_0]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 8 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i16 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i16 [[X_1]], ptr [[GEP_1]], align 2 +; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24 +; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8 +; CHECK-NEXT: store i8 [[X_3]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i8 + %gep.0 = getelementptr i8, ptr %p, i64 3 + store i8 %x.0, ptr %gep.0 + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i16 %x.1, ptr %gep.1 + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + store i8 %x.3, ptr %p + ret void +} diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll new file mode 100644 index 0000000..38a55e1 --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll @@ -0,0 +1,812 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=aggressive-instcombine -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +declare void @use.i16(i16) +declare void @use.i32(i32) + +define void @test_i16(i16 %x, ptr %p) { +; CHECK-LABEL: define void @test_i16( +; CHECK-SAME: i16 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: store i16 [[X]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i16 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i16 %x, 8 + %x.1 = trunc i16 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_i8_parts(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_i8_parts( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + %shr.2 = lshr i32 %x, 16 + %x.2 = trunc i32 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p, i64 2 + store i8 %x.2, ptr %gep.2 + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + %gep.3 = getelementptr i8, ptr %p, i64 3 + store i8 %x.3, ptr %gep.3 + ret void +} + +define void @test_i32_i16_parts(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_i16_parts( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 2 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i16 + store i16 %x.0, ptr %p + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i16 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_mixed_parts(i32 %x, ptr %p) { +; 
CHECK-LABEL: define void @test_i32_mixed_parts( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i16 %x.1, ptr %gep.1 + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + %gep.3 = getelementptr i8, ptr %p, i64 3 + store i8 %x.3, ptr %gep.3 + ret void +} + +define void @test_i64(i64 %x, ptr %p) { +; CHECK-LABEL: define void @test_i64( +; CHECK-SAME: i64 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: store i64 [[X]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i64 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i64 %x, 8 + %x.1 = trunc i64 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + %shr.2 = lshr i64 %x, 16 + %x.2 = trunc i64 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p, i64 2 + store i8 %x.2, ptr %gep.2 + %shr.3 = lshr i64 %x, 24 + %x.3 = trunc i64 %shr.3 to i8 + %gep.3 = getelementptr i8, ptr %p, i64 3 + store i8 %x.3, ptr %gep.3 + %shr.4 = lshr i64 %x, 32 + %x.4 = trunc i64 %shr.4 to i8 + %gep.4 = getelementptr i8, ptr %p, i64 4 + store i8 %x.4, ptr %gep.4 + %shr.5 = lshr i64 %x, 40 + %x.5 = trunc i64 %shr.5 to i8 + %gep.5 = getelementptr i8, ptr %p, i64 5 + store i8 %x.5, ptr %gep.5 + %shr.6 = lshr i64 %x, 48 + %x.6 = trunc i64 %shr.6 to i8 + %gep.6 = getelementptr i8, ptr %p, i64 6 + store i8 %x.6, ptr %gep.6 + %shr.7 = lshr i64 %x, 56 + %x.7 = trunc i64 %shr.7 to i8 + %gep.7 = getelementptr i8, ptr %p, i64 7 + store i8 %x.7, ptr %gep.7 + ret void +} + +define void @test_i128(i128 %x, ptr %p) { +; CHECK-LABEL: define void @test_i128( +; CHECK-SAME: i128 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i128 [[X]] to i8 +; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i128 [[X]], 8 +; CHECK-NEXT: [[X_1:%.*]] = trunc i128 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: [[SHR_2:%.*]] = lshr i128 [[X]], 16 +; CHECK-NEXT: [[X_2:%.*]] = trunc i128 [[SHR_2]] to i8 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 2 +; CHECK-NEXT: store i8 [[X_2]], ptr [[GEP_2]], align 1 +; CHECK-NEXT: [[SHR_3:%.*]] = lshr i128 [[X]], 24 +; CHECK-NEXT: [[X_3:%.*]] = trunc i128 [[SHR_3]] to i8 +; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3 +; CHECK-NEXT: store i8 [[X_3]], ptr [[GEP_3]], align 1 +; CHECK-NEXT: [[SHR_4:%.*]] = lshr i128 [[X]], 32 +; CHECK-NEXT: [[X_4:%.*]] = trunc i128 [[SHR_4]] to i8 +; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr i8, ptr [[P]], i64 4 +; CHECK-NEXT: store i8 [[X_4]], ptr [[GEP_4]], align 1 +; CHECK-NEXT: [[SHR_5:%.*]] = lshr i128 [[X]], 40 +; CHECK-NEXT: [[X_5:%.*]] = trunc i128 [[SHR_5]] to i8 +; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr i8, ptr [[P]], i64 5 +; CHECK-NEXT: store i8 [[X_5]], ptr [[GEP_5]], align 1 +; CHECK-NEXT: [[SHR_6:%.*]] = lshr i128 [[X]], 48 +; CHECK-NEXT: [[X_6:%.*]] = trunc i128 [[SHR_6]] to i8 +; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr i8, ptr [[P]], i64 6 +; CHECK-NEXT: store i8 [[X_6]], ptr [[GEP_6]], align 1 +; CHECK-NEXT: [[SHR_7:%.*]] = lshr i128 [[X]], 56 +; CHECK-NEXT: [[X_7:%.*]] = trunc i128 [[SHR_7]] to i8 +; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr i8, ptr [[P]], i64 7 +; CHECK-NEXT: store i8 [[X_7]], ptr [[GEP_7]], align 1 +; CHECK-NEXT: [[SHR_8:%.*]] = 
lshr i128 [[X]], 64 +; CHECK-NEXT: [[X_8:%.*]] = trunc i128 [[SHR_8]] to i8 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i8 [[X_8]], ptr [[GEP_8]], align 1 +; CHECK-NEXT: [[SHR_9:%.*]] = lshr i128 [[X]], 72 +; CHECK-NEXT: [[X_9:%.*]] = trunc i128 [[SHR_9]] to i8 +; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr i8, ptr [[P]], i64 9 +; CHECK-NEXT: store i8 [[X_9]], ptr [[GEP_9]], align 1 +; CHECK-NEXT: [[SHR_10:%.*]] = lshr i128 [[X]], 80 +; CHECK-NEXT: [[X_10:%.*]] = trunc i128 [[SHR_10]] to i8 +; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr i8, ptr [[P]], i64 10 +; CHECK-NEXT: store i8 [[X_10]], ptr [[GEP_10]], align 1 +; CHECK-NEXT: [[SHR_11:%.*]] = lshr i128 [[X]], 88 +; CHECK-NEXT: [[X_11:%.*]] = trunc i128 [[SHR_11]] to i8 +; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr i8, ptr [[P]], i64 11 +; CHECK-NEXT: store i8 [[X_11]], ptr [[GEP_11]], align 1 +; CHECK-NEXT: [[SHR_12:%.*]] = lshr i128 [[X]], 96 +; CHECK-NEXT: [[X_12:%.*]] = trunc i128 [[SHR_12]] to i8 +; CHECK-NEXT: [[GEP_12:%.*]] = getelementptr i8, ptr [[P]], i64 12 +; CHECK-NEXT: store i8 [[X_12]], ptr [[GEP_12]], align 1 +; CHECK-NEXT: [[SHR_13:%.*]] = lshr i128 [[X]], 104 +; CHECK-NEXT: [[X_13:%.*]] = trunc i128 [[SHR_13]] to i8 +; CHECK-NEXT: [[GEP_13:%.*]] = getelementptr i8, ptr [[P]], i64 13 +; CHECK-NEXT: store i8 [[X_13]], ptr [[GEP_13]], align 1 +; CHECK-NEXT: [[SHR_14:%.*]] = lshr i128 [[X]], 112 +; CHECK-NEXT: [[X_14:%.*]] = trunc i128 [[SHR_14]] to i8 +; CHECK-NEXT: [[GEP_14:%.*]] = getelementptr i8, ptr [[P]], i64 14 +; CHECK-NEXT: store i8 [[X_14]], ptr [[GEP_14]], align 1 +; CHECK-NEXT: [[SHR_15:%.*]] = lshr i128 [[X]], 120 +; CHECK-NEXT: [[X_15:%.*]] = trunc i128 [[SHR_15]] to i8 +; CHECK-NEXT: [[GEP_15:%.*]] = getelementptr i8, ptr [[P]], i64 15 +; CHECK-NEXT: store i8 [[X_15]], ptr [[GEP_15]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i128 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i128 %x, 8 + %x.1 = trunc i128 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + %shr.2 = lshr i128 %x, 16 + %x.2 = trunc i128 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p, i64 2 + store i8 %x.2, ptr %gep.2 + %shr.3 = lshr i128 %x, 24 + %x.3 = trunc i128 %shr.3 to i8 + %gep.3 = getelementptr i8, ptr %p, i64 3 + store i8 %x.3, ptr %gep.3 + %shr.4 = lshr i128 %x, 32 + %x.4 = trunc i128 %shr.4 to i8 + %gep.4 = getelementptr i8, ptr %p, i64 4 + store i8 %x.4, ptr %gep.4 + %shr.5 = lshr i128 %x, 40 + %x.5 = trunc i128 %shr.5 to i8 + %gep.5 = getelementptr i8, ptr %p, i64 5 + store i8 %x.5, ptr %gep.5 + %shr.6 = lshr i128 %x, 48 + %x.6 = trunc i128 %shr.6 to i8 + %gep.6 = getelementptr i8, ptr %p, i64 6 + store i8 %x.6, ptr %gep.6 + %shr.7 = lshr i128 %x, 56 + %x.7 = trunc i128 %shr.7 to i8 + %gep.7 = getelementptr i8, ptr %p, i64 7 + store i8 %x.7, ptr %gep.7 + %shr.8 = lshr i128 %x, 64 + %x.8 = trunc i128 %shr.8 to i8 + %gep.8 = getelementptr i8, ptr %p, i64 8 + store i8 %x.8, ptr %gep.8 + %shr.9 = lshr i128 %x, 72 + %x.9 = trunc i128 %shr.9 to i8 + %gep.9 = getelementptr i8, ptr %p, i64 9 + store i8 %x.9, ptr %gep.9 + %shr.10 = lshr i128 %x, 80 + %x.10 = trunc i128 %shr.10 to i8 + %gep.10 = getelementptr i8, ptr %p, i64 10 + store i8 %x.10, ptr %gep.10 + %shr.11 = lshr i128 %x, 88 + %x.11 = trunc i128 %shr.11 to i8 + %gep.11 = getelementptr i8, ptr %p, i64 11 + store i8 %x.11, ptr %gep.11 + %shr.12 = lshr i128 %x, 96 + %x.12 = trunc i128 %shr.12 to i8 + %gep.12 = getelementptr i8, ptr %p, i64 12 + store i8 %x.12, ptr %gep.12 + %shr.13 = lshr i128 
%x, 104 + %x.13 = trunc i128 %shr.13 to i8 + %gep.13 = getelementptr i8, ptr %p, i64 13 + store i8 %x.13, ptr %gep.13 + %shr.14 = lshr i128 %x, 112 + %x.14 = trunc i128 %shr.14 to i8 + %gep.14 = getelementptr i8, ptr %p, i64 14 + store i8 %x.14, ptr %gep.14 + %shr.15 = lshr i128 %x, 120 + %x.15 = trunc i128 %shr.15 to i8 + %gep.15 = getelementptr i8, ptr %p, i64 15 + store i8 %x.15, ptr %gep.15 + ret void +} + +define void @test_i32_lo(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_lo( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X]] to i16 +; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_hi(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_hi( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[SHR_0:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SHR_0]] to i16 +; CHECK-NEXT: store i16 [[TMP2]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %shr.0 = lshr i32 %x, 16 + %x.0 = trunc i32 %shr.0 to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 24 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_mid(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_mid( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X]], 10 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: store i16 [[TMP2]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %shr.0 = lshr i32 %x, 10 + %x.0 = trunc i32 %shr.0 to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 18 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_shift_in_zeros(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_shift_in_zeros( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[SHR_0:%.*]] = lshr i32 [[X]], 20 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SHR_0]] to i16 +; CHECK-NEXT: store i16 [[TMP2]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %shr.0 = lshr i32 %x, 20 + %x.0 = trunc i32 %shr.0 to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 28 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_base_ptr_with_offset(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_base_ptr_with_offset( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P]], i64 7 +; CHECK-NEXT: store i32 [[X]], ptr [[TMP1]], align 2 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i16 + %gep.0 = getelementptr i8, ptr %p, i64 7 + store i16 %x.0, ptr %gep.0 + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 9 + store i16 %x.1, ptr %gep.1 + ret void +} + +define void @test_aliasing_store(i16 %x, ptr %p, ptr %p2) { +; CHECK-LABEL: define void @test_aliasing_store( +; CHECK-SAME: i16 [[X:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i16 [[X]] to i8 +; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: store i8 0, ptr [[P2]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i16 [[X]], 8 +; CHECK-NEXT: [[X_1:%.*]] = trunc i16 [[SHR_1]] to i8 +; CHECK-NEXT: 
[[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i16 %x to i8 + store i8 %x.0, ptr %p + store i8 0, ptr %p2 + %shr.1 = lshr i16 %x, 8 + %x.1 = trunc i16 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_non_aliasing_store(i16 %x, ptr noalias %p, ptr noalias %p2) { +; CHECK-LABEL: define void @test_non_aliasing_store( +; CHECK-SAME: i16 [[X:%.*]], ptr noalias [[P:%.*]], ptr noalias [[P2:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i16 [[X]] to i8 +; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: store i8 0, ptr [[P2]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i16 [[X]], 8 +; CHECK-NEXT: [[X_1:%.*]] = trunc i16 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i16 %x to i8 + store i8 %x.0, ptr %p + store i8 0, ptr %p2 + %shr.1 = lshr i16 %x, 8 + %x.1 = trunc i16 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define i8 @test_aliasing_load(i16 %x, ptr %p, ptr %p2) { +; CHECK-LABEL: define i8 @test_aliasing_load( +; CHECK-SAME: i16 [[X:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i16 [[X]] to i8 +; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: [[V:%.*]] = load i8, ptr [[P2]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i16 [[X]], 8 +; CHECK-NEXT: [[X_1:%.*]] = trunc i16 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: ret i8 [[V]] +; + %x.0 = trunc i16 %x to i8 + store i8 %x.0, ptr %p + %v = load i8, ptr %p2 + %shr.1 = lshr i16 %x, 8 + %x.1 = trunc i16 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret i8 %v +} + +define i8 @test_non_aliasing_load(i16 %x, ptr noalias %p, ptr noalias %p2) { +; CHECK-LABEL: define i8 @test_non_aliasing_load( +; CHECK-SAME: i16 [[X:%.*]], ptr noalias [[P:%.*]], ptr noalias [[P2:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i16 [[X]] to i8 +; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: [[V:%.*]] = load i8, ptr [[P2]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i16 [[X]], 8 +; CHECK-NEXT: [[X_1:%.*]] = trunc i16 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: ret i8 [[V]] +; + %x.0 = trunc i16 %x to i8 + store i8 %x.0, ptr %p + %v = load i8, ptr %p2 + %shr.1 = lshr i16 %x, 8 + %x.1 = trunc i16 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret i8 %v +} + +define i8 @test_aliasing_load_partially_mergeable(i32 %x, ptr %p, ptr %p2) { +; CHECK-LABEL: define i8 @test_aliasing_load_partially_mergeable( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X]] to i16 +; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1 +; CHECK-NEXT: [[V:%.*]] = load i8, ptr [[P2]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[P]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16 +; CHECK-NEXT: store i16 [[TMP3]], ptr [[TMP4]], align 1 +; CHECK-NEXT: ret i8 [[V]] +; + %x.0 = trunc i32 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 8 + %x.1 
= trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + %v = load i8, ptr %p2 + %shr.2 = lshr i32 %x, 16 + %x.2 = trunc i32 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p, i64 2 + store i8 %x.2, ptr %gep.2 + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + %gep.3 = getelementptr i8, ptr %p, i64 3 + store i8 %x.3, ptr %gep.3 + ret i8 %v +} + +declare void @may_unwind() memory(none) + +define void @test_unwind(i16 %x, ptr %p, ptr %p2) { +; CHECK-LABEL: define void @test_unwind( +; CHECK-SAME: i16 [[X:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i16 [[X]] to i8 +; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: call void @may_unwind() +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i16 [[X]], 8 +; CHECK-NEXT: [[X_1:%.*]] = trunc i16 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i16 %x to i8 + store i8 %x.0, ptr %p + call void @may_unwind() + %shr.1 = lshr i16 %x, 8 + %x.1 = trunc i16 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_multi_group(i16 %x, ptr %p1, i16 %y, ptr %p2) { +; CHECK-LABEL: define void @test_multi_group( +; CHECK-SAME: i16 [[X:%.*]], ptr [[P1:%.*]], i16 [[Y:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: store i16 [[X]], ptr [[P1]], align 1 +; CHECK-NEXT: call void @may_unwind() +; CHECK-NEXT: store i16 [[Y]], ptr [[P2]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i16 %x to i8 + store i8 %x.0, ptr %p1 + %shr.1 = lshr i16 %x, 8 + %x.1 = trunc i16 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p1, i64 1 + store i8 %x.1, ptr %gep.1 + call void @may_unwind() + %y.0 = trunc i16 %y to i8 + store i8 %y.0, ptr %p2 + %shr.2 = lshr i16 %y, 8 + %y.1 = trunc i16 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p2, i64 1 + store i8 %y.1, ptr %gep.2 + ret void +} + +define void @test_stores_out_of_order(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_stores_out_of_order( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %shr.2 = lshr i32 %x, 16 + %x.2 = trunc i32 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p, i64 2 + store i8 %x.2, ptr %gep.2 + %x.0 = trunc i32 %x to i8 + store i8 %x.0, ptr %p + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + %gep.3 = getelementptr i8, ptr %p, i64 3 + store i8 %x.3, ptr %gep.3 + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_gap(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_gap( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[SHR_0:%.*]] = lshr i32 [[X]], 7 +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[SHR_0]] to i8 +; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: ret void +; + %shr.0 = lshr i32 %x, 7 + %x.0 = trunc i32 %shr.0 to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_non_byte_sized(i32 %x, ptr %p) { +; CHECK-LABEL: define void 
@test_i32_non_byte_sized( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i15 +; CHECK-NEXT: store i15 [[X_0]], ptr [[P]], align 2 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 15 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i17 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 2 +; CHECK-NEXT: store i17 [[X_1]], ptr [[GEP_1]], align 4 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i15 + store i15 %x.0, ptr %p + %shr.1 = lshr i32 %x, 15 + %x.1 = trunc i32 %shr.1 to i17 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i17 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_wrong_ptr_offset(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_wrong_ptr_offset( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[SHR_0:%.*]] = lshr i32 [[X]], 8 +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[SHR_0]] to i8 +; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 2 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: ret void +; + %shr.0 = lshr i32 %x, 8 + %x.0 = trunc i32 %shr.0 to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_wrong_endian(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_wrong_endian( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8 +; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr i8, ptr [[P]], i64 3 +; CHECK-NEXT: store i8 [[X_0]], ptr [[GEP_0]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 8 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 2 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[X_2:%.*]] = trunc i32 [[SHR_2]] to i8 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_2]], ptr [[GEP_2]], align 1 +; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24 +; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8 +; CHECK-NEXT: store i8 [[X_3]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i8 + %gep.0 = getelementptr i8, ptr %p, i64 3 + store i8 %x.0, ptr %gep.0 + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i8 %x.1, ptr %gep.1 + %shr.2 = lshr i32 %x, 16 + %x.2 = trunc i32 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p, i64 1 + store i8 %x.2, ptr %gep.2 + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + store i8 %x.3, ptr %p + ret void +} + +define void @test_i32_volatile(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_volatile( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X]], 8 +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[TMP1]] to i8 +; CHECK-NEXT: store volatile i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: ret void +; + %shr.0 = lshr i32 %x, 8 + %x.0 = trunc i32 %shr.0 to i8 + store volatile i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 
%shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_atomic(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_atomic( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[SHR_0:%.*]] = lshr i32 [[X]], 8 +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[SHR_0]] to i8 +; CHECK-NEXT: store atomic i8 [[X_0]], ptr [[P]] monotonic, align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: ret void +; + %shr.0 = lshr i32 %x, 8 + %x.0 = trunc i32 %shr.0 to i8 + store atomic i8 %x.0, ptr %p monotonic, align 1 + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_multiple_pointers(i32 %x, i32 %y, ptr %p, ptr %p2) { +; CHECK-LABEL: define void @test_i32_multiple_pointers( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 2 +; CHECK-NEXT: store i32 [[Y]], ptr [[P2]], align 2 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i16 + store i16 %x.0, ptr %p + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i16 %x.1, ptr %gep.1 + + %y.0 = trunc i32 %y to i16 + store i16 %y.0, ptr %p2 + %y.shr.1 = lshr i32 %y, 16 + %y.1 = trunc i32 %y.shr.1 to i16 + %p2.gep.1 = getelementptr i8, ptr %p2, i64 2 + store i16 %y.1, ptr %p2.gep.1 + ret void +} + +define void @test_i32_multiple_pointers_interleaved(i32 %x, i32 %y, ptr noalias %p, ptr noalias %p2) { +; CHECK-LABEL: define void @test_i32_multiple_pointers_interleaved( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr noalias [[P:%.*]], ptr noalias [[P2:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i16 +; CHECK-NEXT: store i16 [[X_0]], ptr [[P]], align 2 +; CHECK-NEXT: [[Y_0:%.*]] = trunc i32 [[Y]] to i16 +; CHECK-NEXT: store i16 [[Y_0]], ptr [[P2]], align 2 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i16 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 2 +; CHECK-NEXT: store i16 [[X_1]], ptr [[GEP_1]], align 2 +; CHECK-NEXT: [[Y_SHR_1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[Y_1:%.*]] = trunc i32 [[Y_SHR_1]] to i16 +; CHECK-NEXT: [[P2_GEP_1:%.*]] = getelementptr i8, ptr [[P2]], i64 2 +; CHECK-NEXT: store i16 [[Y_1]], ptr [[P2_GEP_1]], align 2 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i16 + store i16 %x.0, ptr %p + %y.0 = trunc i32 %y to i16 + store i16 %y.0, ptr %p2 + + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i16 %x.1, ptr %gep.1 + %y.shr.1 = lshr i32 %y, 16 + %y.1 = trunc i32 %y.shr.1 to i16 + %p2.gep.1 = getelementptr i8, ptr %p2, i64 2 + store i16 %y.1, ptr %p2.gep.1 + ret void +} + +define void @test_i32_multi_use(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_multi_use( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i16 +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 2 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i16 +; CHECK-NEXT: call void @use.i16(i16 [[X_0]]) +; CHECK-NEXT: call void @use.i16(i16 [[X_1]]) +; CHECK-NEXT: call void @use.i32(i32 [[SHR_1]]) +; CHECK-NEXT: 
ret void +; + %x.0 = trunc i32 %x to i16 + store i16 %x.0, ptr %p + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i16 %x.1, ptr %gep.1 + call void @use.i16(i16 %x.0) + call void @use.i16(i16 %x.1) + call void @use.i32(i32 %shr.1) + ret void +} + +define void @test_i32_scoped_aa_same(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_scoped_aa_same( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 2, !noalias [[META0:![0-9]+]] +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i16 + store i16 %x.0, ptr %p, !noalias !0 + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i16 %x.1, ptr %gep.1, !noalias !0 + ret void +} + +define void @test_i32_scoped_aa_different(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_scoped_aa_different( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 2, !noalias [[META3:![0-9]+]] +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i16 + store i16 %x.0, ptr %p, !noalias !0 + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i16 %x.1, ptr %gep.1, !noalias !3 + ret void +} + +define void @test_i32_tbaa(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_tbaa( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 2 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i16 + store i16 %x.0, ptr %p, !tbaa !6 + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i16 %x.1, ptr %gep.1, !tbaa !6 + ret void +} + +!0 = !{!1} +!1 = !{!1, !2} +!2 = !{!2} + +!3 = !{!4} +!4 = !{!4, !5} +!5 = !{!5} + +!6 = !{!7, !7, i64 0} +!7 = !{!"short", !8, i64 0} +!8 = !{!"omnipotent char", !9, i64 0} +!9 = !{!"Simple C/C++ TBAA"} +;. +; CHECK: [[META0]] = !{[[META1:![0-9]+]]} +; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; CHECK: [[META2]] = distinct !{[[META2]]} +; CHECK: [[META3]] = !{} +;. 
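Note on the block of store-merge tests above (aliasing/non-aliasing stores and loads, unwinding calls, out-of-order pieces, gaps, non-byte-sized types, wrong offsets, wrong endianness, volatile/atomic stores, multiple pointers, scoped-AA and TBAA metadata): they all exercise one transform, rewriting a chain of trunc/lshr piece stores as a single wide store. A minimal sketch of the positive pattern follows, assuming a little-endian target; the function name and body are illustrative, not taken from the patch.

define void @merge_halves(i32 %x, ptr %p) {
  ; The low and high i16 halves of %x are stored to adjacent offsets,
  ; with no aliasing access or unwinding call in between.
  %lo = trunc i32 %x to i16
  store i16 %lo, ptr %p, align 2
  %hi.shift = lshr i32 %x, 16
  %hi = trunc i32 %hi.shift to i16
  %gep = getelementptr i8, ptr %p, i64 2
  store i16 %hi, ptr %gep, align 2
  ; Expected result after merging: store i32 %x, ptr %p, align 2
  ret void
}

Per the negative tests above, the merge is blocked when the pieces leave a gap, are not byte-sized, land at the wrong offset or in big-endian order, or when one of the stores is volatile or atomic.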
diff --git a/llvm/test/Transforms/Attributor/heap_to_stack.ll b/llvm/test/Transforms/Attributor/heap_to_stack.ll index fad4acb..6719290 100644 --- a/llvm/test/Transforms/Attributor/heap_to_stack.ll +++ b/llvm/test/Transforms/Attributor/heap_to_stack.ll @@ -393,26 +393,6 @@ bb: ret i32 %i2 } -define i32 @test_lifetime() { -; CHECK-LABEL: define {{[^@]+}}@test_lifetime() { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[I_H2S:%.*]] = alloca i8, i64 4, align 1 -; CHECK-NEXT: tail call void @no_sync_func(ptr noalias nofree captures(none) [[I_H2S]]) -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 noundef 4, ptr noalias nofree nonnull align 4 captures(none) dereferenceable(4) [[I_H2S]]) -; CHECK-NEXT: store i32 10, ptr [[I_H2S]], align 4 -; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[I_H2S]], align 4 -; CHECK-NEXT: ret i32 [[I2]] -; -bb: - %i = tail call noalias ptr @malloc(i64 4) - tail call void @no_sync_func(ptr %i) - call void @llvm.lifetime.start.p0(i64 4, ptr %i) - store i32 10, ptr %i, align 4 - %i2 = load i32, ptr %i, align 4 - tail call void @free(ptr %i) - ret i32 %i2 -} - ; TEST 11 define void @test11() { diff --git a/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll b/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll index c7a9ec8..0be9434 100644 --- a/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll +++ b/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll @@ -340,27 +340,6 @@ bb: ret i32 %i2 } -define i32 @test_lifetime() { -; CHECK-LABEL: define {{[^@]+}}@test_lifetime() { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[I:%.*]] = tail call noalias ptr @malloc(i64 noundef 4) -; CHECK-NEXT: tail call void @no_sync_func(ptr noalias nofree captures(none) [[I]]) -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 noundef 4, ptr noalias nofree nonnull align 4 captures(none) dereferenceable(4) [[I]]) -; CHECK-NEXT: store i32 10, ptr [[I]], align 4 -; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[I]], align 4 -; CHECK-NEXT: tail call void @free(ptr noalias nonnull align 4 captures(none) dereferenceable(4) [[I]]) -; CHECK-NEXT: ret i32 [[I2]] -; -bb: - %i = tail call noalias ptr @malloc(i64 4) - tail call void @no_sync_func(ptr %i) - call void @llvm.lifetime.start.p0(i64 4, ptr %i) - store i32 10, ptr %i, align 4 - %i2 = load i32, ptr %i, align 4 - tail call void @free(ptr %i) - ret i32 %i2 -} - ; TEST 11 define void @test11() { diff --git a/llvm/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll b/llvm/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll index 005c021..54782c5 100644 --- a/llvm/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll +++ b/llvm/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll @@ -18,11 +18,11 @@ bb: br i1 %tmp4, label %bb6, label %bb5 bb5: ; preds = %bb - call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %tmp1) #2 + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %tmp) #2 store i32 %tmp3, ptr %tmp, align 4, !tbaa !2 store i32 %tmp3, ptr @g, align 4, !tbaa !2 call void @bar(ptr nonnull %tmp) #2 - call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %tmp1) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %tmp) #2 br label %bb6 bb6: ; preds = %bb5, %bb diff --git a/llvm/test/Transforms/CodeExtractor/live_shrink_gep.ll b/llvm/test/Transforms/CodeExtractor/live_shrink_gep.ll index 03ff31b..e9d5fb6 100644 --- a/llvm/test/Transforms/CodeExtractor/live_shrink_gep.ll +++ b/llvm/test/Transforms/CodeExtractor/live_shrink_gep.ll @@ -9,8 +9,7 @@ define void @_Z3foov() local_unnamed_addr { bb: %tmp = alloca %class.A, align 1 - %tmp1 = getelementptr inbounds 
%class.A, ptr %tmp, i64 0, i32 0 - call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %tmp1) + call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %tmp) %tmp2 = load i32, ptr @cond, align 4, !tbaa !2 %tmp3 = icmp eq i32 %tmp2, 0 br i1 %tmp3, label %bb4, label %bb5 @@ -20,7 +19,7 @@ bb4: ; preds = %bb br label %bb5 bb5: ; preds = %bb4, %bb - call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %tmp1) + call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %tmp) ret void } @@ -38,7 +37,6 @@ define void @_Z3goov() local_unnamed_addr { bb: ; CHECK: bb: ; CHECK-NOT: alloca -; CHECK-NOT: getelementptr ; CHECK-NOT: llvm.lifetime ; CHECK: br i1 ; CHECK: codeRepl.i: @@ -50,7 +48,6 @@ bb: ; CHECK-LABEL: define internal void @_Z3foov.1. ; CHECK: newFuncRoot: ; CHECK-NEXT: %tmp = alloca %class.A -; CHECK-NEXT: %tmp1 = getelementptr ; CHECK-NEXT: call void @llvm.lifetime.start.p0 ; CHECK: call void @llvm.lifetime.end.p0 ; CHECK-NEXT: br label %bb5.exitStub diff --git a/llvm/test/Transforms/ConstantHoisting/AArch64/const-hoist-intrinsics.ll b/llvm/test/Transforms/ConstantHoisting/AArch64/const-hoist-intrinsics.ll index 9b5362d..6bf268b 100644 --- a/llvm/test/Transforms/ConstantHoisting/AArch64/const-hoist-intrinsics.ll +++ b/llvm/test/Transforms/ConstantHoisting/AArch64/const-hoist-intrinsics.ll @@ -61,10 +61,11 @@ entry: declare i64 @llvm.aarch64.udiv.i64.i64(i64, i64) -define void @test_free_intrinsics(i64 %x, ptr %ptr) { +define void @test_free_intrinsics(i64 %x) { ; CHECK-LABEL: @test_free_intrinsics( ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 100000000032, ptr [[PTR:%.*]]) +; CHECK-NEXT: [[PTR:%.*]] = alloca i8, align 1 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 100000000032, ptr [[PTR]]) ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 100000000064, ptr [[PTR]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 100000000128, ptr [[PTR]]) ; CHECK-NEXT: [[I:%.*]] = call ptr @llvm.invariant.start.p0(i64 100000000256, ptr [[PTR]]) @@ -72,6 +73,7 @@ define void @test_free_intrinsics(i64 %x, ptr %ptr) { ; CHECK-NEXT: ret void ; entry: + %ptr = alloca i8 call void @llvm.lifetime.start.p0(i64 100000000032, ptr %ptr) call void @llvm.lifetime.start.p0(i64 100000000064, ptr %ptr) call void @llvm.lifetime.end.p0(i64 100000000128, ptr %ptr) diff --git a/llvm/test/Transforms/DCE/basic.ll b/llvm/test/Transforms/DCE/basic.ll index 134994a..1a3b12e 100644 --- a/llvm/test/Transforms/DCE/basic.ll +++ b/llvm/test/Transforms/DCE/basic.ll @@ -26,47 +26,5 @@ define i32 @test_lifetime_alloca() { ret i32 0 } -; CHECK-LABEL: @test_lifetime_arg -define i32 @test_lifetime_arg(ptr) { -; Check that lifetime intrinsics are removed along with the pointer. -; CHECK-NEXT: #dbg_value -; CHECK-NEXT: ret i32 0 -; CHECK-NOT: llvm.lifetime.start -; CHECK-NOT: llvm.lifetime.end - call void @llvm.lifetime.start.p0(i64 -1, ptr %0) - call void @llvm.lifetime.end.p0(i64 -1, ptr %0) - ret i32 0 -} - -@glob = global i8 1 - -; CHECK-LABEL: @test_lifetime_global -define i32 @test_lifetime_global() { -; Check that lifetime intrinsics are removed along with the pointer. -; CHECK-NEXT: #dbg_value -; CHECK-NEXT: ret i32 0 -; CHECK-NOT: llvm.lifetime.start -; CHECK-NOT: llvm.lifetime.end - call void @llvm.lifetime.start.p0(i64 -1, ptr @glob) - call void @llvm.lifetime.end.p0(i64 -1, ptr @glob) - ret i32 0 -} - -; CHECK-LABEL: @test_lifetime_bitcast -define i32 @test_lifetime_bitcast(ptr %arg) { -; Check that lifetime intrinsics are NOT removed when the pointer is a bitcast. 
-; It's not uncommon for two bitcasts to be made: one for lifetime, one for use. -; TODO: Support the above case. -; CHECK-NEXT: bitcast -; CHECK-NEXT: #dbg_value -; CHECK-NEXT: llvm.lifetime.start.p0(i64 -1, ptr %cast) -; CHECK-NEXT: llvm.lifetime.end.p0(i64 -1, ptr %cast) -; CHECK-NEXT: ret i32 0 - %cast = bitcast ptr %arg to ptr - call void @llvm.lifetime.start.p0(i64 -1, ptr %cast) - call void @llvm.lifetime.end.p0(i64 -1, ptr %cast) - ret i32 0 -} - ; CHECK: [[add]] = !DILocalVariable ; CHECK: [[sub]] = !DILocalVariable diff --git a/llvm/test/Transforms/DeadStoreElimination/libcalls.ll b/llvm/test/Transforms/DeadStoreElimination/libcalls.ll index 4d9a767..27ad639 100644 --- a/llvm/test/Transforms/DeadStoreElimination/libcalls.ll +++ b/llvm/test/Transforms/DeadStoreElimination/libcalls.ll @@ -67,19 +67,6 @@ define void @test_strcat_with_lifetime(ptr %src) { ret void } -define void @test_strcat_with_lifetime_nonlocal(ptr %dest, ptr %src) { -; CHECK-LABEL: @test_strcat_with_lifetime_nonlocal( -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[DEST:%.*]]) -; CHECK-NEXT: [[CALL:%.*]] = call ptr @strcat(ptr [[DEST]], ptr [[SRC:%.*]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[DEST]]) -; CHECK-NEXT: ret void -; - call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %dest) - %call = call ptr @strcat(ptr %dest, ptr %src) - call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %dest) - ret void -} - declare ptr @strncat(ptr %dest, ptr %src, i64 %n) nounwind define void @test4(ptr %src) { ; CHECK-LABEL: @test4( diff --git a/llvm/test/Transforms/DeadStoreElimination/lifetime.ll b/llvm/test/Transforms/DeadStoreElimination/lifetime.ll index 73b9903..19e7b0d 100644 --- a/llvm/test/Transforms/DeadStoreElimination/lifetime.ll +++ b/llvm/test/Transforms/DeadStoreElimination/lifetime.ll @@ -25,12 +25,12 @@ define void @test1() { define void @test2(ptr %P) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[Q:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 1 +; CHECK-NEXT: [[Q:%.*]] = alloca i32, align 4 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[Q]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[Q]]) ; CHECK-NEXT: ret void ; - %Q = getelementptr i32, ptr %P, i32 1 + %Q = alloca i32 call void @llvm.lifetime.start.p0(i64 4, ptr %Q) store i32 0, ptr %Q ;; This store is dead. call void @llvm.lifetime.end.p0(i64 4, ptr %Q) @@ -114,19 +114,19 @@ exit: ; lifetime.end only marks the first two bytes of %A as dead. Make sure ; `store i8 20, ptr %A.2` is not removed.
-define void @test5_lifetime_end_partial(ptr %A) { +define void @test5_lifetime_end_partial() { ; CHECK-LABEL: @test5_lifetime_end_partial( -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr [[A:%.*]]) +; CHECK-NEXT: [[A:%.*]] = alloca [4 x i8], align 1 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr [[A]]) ; CHECK-NEXT: [[A_1:%.*]] = getelementptr i8, ptr [[A]], i64 1 ; CHECK-NEXT: [[A_2:%.*]] = getelementptr i8, ptr [[A]], i64 2 ; CHECK-NEXT: store i8 20, ptr [[A_2]], align 1 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr [[A]]) ; CHECK-NEXT: call void @use(ptr [[A_1]]) -; CHECK-NEXT: store i8 30, ptr [[A_1]], align 1 -; CHECK-NEXT: store i8 40, ptr [[A_2]], align 1 ; CHECK-NEXT: ret void ; + %A = alloca [4 x i8] call void @llvm.lifetime.start.p0(i64 2, ptr %A) %A.1 = getelementptr i8, ptr %A, i64 1 %A.2 = getelementptr i8, ptr %A, i64 2 diff --git a/llvm/test/Transforms/DeadStoreElimination/multiblock-multipath.ll b/llvm/test/Transforms/DeadStoreElimination/multiblock-multipath.ll index 95bd859..588bdc0 100644 --- a/llvm/test/Transforms/DeadStoreElimination/multiblock-multipath.ll +++ b/llvm/test/Transforms/DeadStoreElimination/multiblock-multipath.ll @@ -398,7 +398,7 @@ bb5: @linenum = external local_unnamed_addr global i32, align 4 -define void @accessible_after_return11_loop() { +define void @accessible_after_return11_loop(ptr noalias %p) { ; CHECK-LABEL: @accessible_after_return11_loop( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY_I:%.*]] @@ -406,7 +406,7 @@ define void @accessible_after_return11_loop() { ; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[C_1]], label [[FOR_BODY_I]], label [[INIT_PARSE_EXIT:%.*]] ; CHECK: init_parse.exit: -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr nonnull undef) +; CHECK-NEXT: store i32 1, ptr [[P:%.*]], align 4 ; CHECK-NEXT: store i32 0, ptr @linenum, align 4 ; CHECK-NEXT: br label [[FOR_BODY_I20:%.*]] ; CHECK: for.body.i20: @@ -424,7 +424,7 @@ for.body.i: ; preds = %for.body.i, %entry init_parse.exit: ; preds = %for.body.i store i32 0, ptr @linenum, align 4 - call void @llvm.lifetime.end.p0(i64 16, ptr nonnull undef) #2 + store i32 1, ptr %p store i32 0, ptr @linenum, align 4 br label %for.body.i20 @@ -435,7 +435,6 @@ for.body.i20: ; preds = %for.body.i20, %init exit: ret void } -declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) declare i1 @cond() readnone nounwind ; Tests where the pointer/object is *NOT* accessible after the function returns. diff --git a/llvm/test/Transforms/EarlyCSE/memoryssa.ll b/llvm/test/Transforms/EarlyCSE/memoryssa.ll index 942b6f8..ba4cce4 100644 --- a/llvm/test/Transforms/EarlyCSE/memoryssa.ll +++ b/llvm/test/Transforms/EarlyCSE/memoryssa.ll @@ -142,10 +142,12 @@ end: ;; Check that we respect lifetime.start/lifetime.end intrinsics when deleting ;; stores that, without the lifetime calls, would be writebacks. 
-define void @test_writeback_lifetimes(ptr %p) { +define void @test_writeback_lifetimes() { ; CHECK-NOMEMSSA-LABEL: @test_writeback_lifetimes( ; CHECK-NOMEMSSA-NEXT: entry: -; CHECK-NOMEMSSA-NEXT: [[Q:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 1 +; CHECK-NOMEMSSA-NEXT: [[P:%.*]] = alloca i64, align 8 +; CHECK-NOMEMSSA-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[P]]) +; CHECK-NOMEMSSA-NEXT: [[Q:%.*]] = getelementptr i32, ptr [[P]], i64 1 ; CHECK-NOMEMSSA-NEXT: [[PV:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NOMEMSSA-NEXT: [[QV:%.*]] = load i32, ptr [[Q]], align 4 ; CHECK-NOMEMSSA-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[P]]) @@ -156,7 +158,9 @@ define void @test_writeback_lifetimes(ptr %p) { ; ; CHECK-LABEL: @test_writeback_lifetimes( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[Q:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 1 +; CHECK-NEXT: [[P:%.*]] = alloca i64, align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[P]]) +; CHECK-NEXT: [[Q:%.*]] = getelementptr i32, ptr [[P]], i64 1 ; CHECK-NEXT: [[PV:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[QV:%.*]] = load i32, ptr [[Q]], align 4 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[P]]) @@ -166,6 +170,8 @@ define void @test_writeback_lifetimes(ptr %p) { ; CHECK-NEXT: ret void ; entry: + %p = alloca i64 + call void @llvm.lifetime.start.p0(i64 8, ptr %p) %q = getelementptr i32, ptr %p, i64 1 %pv = load i32, ptr %p %qv = load i32, ptr %q @@ -178,10 +184,12 @@ entry: ;; Check that we respect lifetime.start/lifetime.end intrinsics when deleting ;; stores that, without the lifetime calls, would be writebacks. -define void @test_writeback_lifetimes_multi_arg(ptr %p, ptr %q) { +define void @test_writeback_lifetimes_multi_arg(ptr %q) { ; CHECK-NOMEMSSA-LABEL: @test_writeback_lifetimes_multi_arg( ; CHECK-NOMEMSSA-NEXT: entry: -; CHECK-NOMEMSSA-NEXT: [[PV:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NOMEMSSA-NEXT: [[P:%.*]] = alloca i64, align 8 +; CHECK-NOMEMSSA-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[P]]) +; CHECK-NOMEMSSA-NEXT: [[PV:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NOMEMSSA-NEXT: [[QV:%.*]] = load i32, ptr [[Q:%.*]], align 4 ; CHECK-NOMEMSSA-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[P]]) ; CHECK-NOMEMSSA-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[P]]) @@ -191,15 +199,18 @@ define void @test_writeback_lifetimes_multi_arg(ptr %p, ptr %q) { ; ; CHECK-LABEL: @test_writeback_lifetimes_multi_arg( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[PV:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[P:%.*]] = alloca i64, align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[P]]) +; CHECK-NEXT: [[PV:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[QV:%.*]] = load i32, ptr [[Q:%.*]], align 4 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[P]]) ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[P]]) ; CHECK-NEXT: store i32 [[PV]], ptr [[P]], align 4 -; CHECK-NEXT: store i32 [[QV]], ptr [[Q]], align 4 ; CHECK-NEXT: ret void ; entry: + %p = alloca i64 + call void @llvm.lifetime.start.p0(i64 8, ptr %p) %pv = load i32, ptr %p %qv = load i32, ptr %q call void @llvm.lifetime.end.p0(i64 8, ptr %p) diff --git a/llvm/test/Transforms/GVN/assume.ll b/llvm/test/Transforms/GVN/assume.ll index 1498aa4..5d3a23b 100644 --- a/llvm/test/Transforms/GVN/assume.ll +++ b/llvm/test/Transforms/GVN/assume.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=gvn 
-verify-analysis-invalidation -S | FileCheck --check-prefixes=CHECK,MDEP %s -; RUN: opt < %s -passes='gvn<memoryssa;no-memdep>' -verify-analysis-invalidation -S | FileCheck --check-prefixes=CHECK,MSSA %s +; RUN: opt < %s -passes='gvn<memoryssa>' -verify-analysis-invalidation -S | FileCheck --check-prefixes=CHECK,MSSA %s declare void @llvm.assume(i1) declare void @use(i1) diff --git a/llvm/test/Transforms/GVN/basic.ll b/llvm/test/Transforms/GVN/basic.ll index c1a358a..2e360aa 100644 --- a/llvm/test/Transforms/GVN/basic.ll +++ b/llvm/test/Transforms/GVN/basic.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -passes=gvn -S | FileCheck %s --check-prefixes=CHECK,MDEP -; RUN: opt < %s -passes='gvn<memoryssa;no-memdep>' -S | FileCheck --check-prefixes=CHECK,MSSA %s +; RUN: opt < %s -passes='gvn<memoryssa>' -S | FileCheck --check-prefixes=CHECK,MSSA %s define i32 @main() { ; CHECK-LABEL: define i32 @main() { diff --git a/llvm/test/Transforms/GVN/nonescaping.ll b/llvm/test/Transforms/GVN/nonescaping.ll index 2913755..0866a27 100644 --- a/llvm/test/Transforms/GVN/nonescaping.ll +++ b/llvm/test/Transforms/GVN/nonescaping.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -S -passes=gvn 2>&1 | FileCheck %s --check-prefixes=CHECK,MDEP -; RUN: opt < %s -S -passes='gvn<memoryssa;no-memdep>' 2>&1 | FileCheck --check-prefixes=CHECK,MSSA %s +; RUN: opt < %s -S -passes='gvn<memoryssa>' 2>&1 | FileCheck --check-prefixes=CHECK,MSSA %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" diff --git a/llvm/test/Transforms/GVN/opt-remarks.ll b/llvm/test/Transforms/GVN/opt-remarks.ll index 8fb2d57..87cd54d 100644 --- a/llvm/test/Transforms/GVN/opt-remarks.ll +++ b/llvm/test/Transforms/GVN/opt-remarks.ll @@ -107,7 +107,8 @@ entry: ret i32 %add } -define i8 @lifetime_end(ptr %p, i8 %val) { +define i8 @lifetime_end(i8 %val) { + %p = alloca [32 x i8] call void @llvm.lifetime.start.p0(i64 32, ptr %p) store i8 %val, ptr %p call void @llvm.lifetime.end.p0(i64 32, ptr %p) diff --git a/llvm/test/Transforms/GVN/phi.ll b/llvm/test/Transforms/GVN/phi.ll index 5b607f7..a0207cf 100644 --- a/llvm/test/Transforms/GVN/phi.ll +++ b/llvm/test/Transforms/GVN/phi.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -passes=gvn < %s | FileCheck --check-prefixes=CHECK,MDEP %s -; RUN: opt -S -passes='gvn<memoryssa;no-memdep>' < %s | FileCheck %s +; RUN: opt -S -passes='gvn<memoryssa>' < %s | FileCheck %s define i64 @test1(i1 %c, i64 %a, i64 %b) { diff --git a/llvm/test/Transforms/GVN/pr14166.ll b/llvm/test/Transforms/GVN/pr14166.ll index bbc8c89..6e23bdc 100644 --- a/llvm/test/Transforms/GVN/pr14166.ll +++ b/llvm/test/Transforms/GVN/pr14166.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -disable-basic-aa -passes=gvn -S < %s | FileCheck %s --check-prefixes=CHECK,MDEP -; RUN: opt -disable-basic-aa -passes='gvn<memoryssa;no-memdep>' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s +; RUN: opt -disable-basic-aa -passes='gvn<memoryssa>' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s target datalayout = "e-p:32:32:32" define <2 x i32> @test1() { ; MDEP-LABEL: define <2 x i32> @test1() { diff --git a/llvm/test/Transforms/GVN/pre-compare.ll 
b/llvm/test/Transforms/GVN/pre-compare.ll index 574d40d..c4f083b 100644 --- a/llvm/test/Transforms/GVN/pre-compare.ll +++ b/llvm/test/Transforms/GVN/pre-compare.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -passes=gvn -S < %s | FileCheck --check-prefixes=CHECK,MDEP %s -; RUN: opt -passes='gvn<memoryssa;no-memdep>' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s +; RUN: opt -passes='gvn<memoryssa>' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s ; C source: ; diff --git a/llvm/test/Transforms/GVN/readattrs.ll b/llvm/test/Transforms/GVN/readattrs.ll index be018834..6e02dd3 100644 --- a/llvm/test/Transforms/GVN/readattrs.ll +++ b/llvm/test/Transforms/GVN/readattrs.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -passes=gvn -S -o - < %s | FileCheck --check-prefixes=CHECK,MDEP %s -; RUN: opt -passes='gvn<memoryssa;no-memdep>' -S -o - < %s | FileCheck --check-prefixes=CHECK,MSSA %s +; RUN: opt -passes='gvn<memoryssa>' -S -o - < %s | FileCheck --check-prefixes=CHECK,MSSA %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/GVN/setjmp.ll b/llvm/test/Transforms/GVN/setjmp.ll index 7777038..53518784 100644 --- a/llvm/test/Transforms/GVN/setjmp.ll +++ b/llvm/test/Transforms/GVN/setjmp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes=gvn < %s | FileCheck --check-prefixes=CHECK,MDEP %s -; RUN: opt -S -passes='gvn<memoryssa;no-memdep>' -verify-analysis-invalidation < %s | FileCheck --check-prefixes=CHECK,MSSA %s +; RUN: opt -S -passes='gvn<memoryssa>' -verify-analysis-invalidation < %s | FileCheck --check-prefixes=CHECK,MSSA %s declare i32 @setjmp() returns_twice declare void @longjmp() declare ptr @malloc(i64) diff --git a/llvm/test/Transforms/GVN/tbaa.ll b/llvm/test/Transforms/GVN/tbaa.ll index 366dfec..59ace14 100644 --- a/llvm/test/Transforms/GVN/tbaa.ll +++ b/llvm/test/Transforms/GVN/tbaa.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -passes=gvn -S < %s | FileCheck --check-prefixes=CHECK,MDEP %s -; RUN: opt -passes='gvn<memoryssa;no-memdep>' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s +; RUN: opt -passes='gvn<memoryssa>' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s define i32 @test1(ptr %p, ptr %q) { ; MDEP-LABEL: define i32 @test1( diff --git a/llvm/test/Transforms/GVN/vscale.ll b/llvm/test/Transforms/GVN/vscale.ll index 646a67d..5d6c559 100644 --- a/llvm/test/Transforms/GVN/vscale.ll +++ b/llvm/test/Transforms/GVN/vscale.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S < %s -passes=gvn,dce | FileCheck --check-prefixes=CHECK,MDEP %s -; RUN: opt -S < %s -passes='gvn<memoryssa;no-memdep>',dce | FileCheck --check-prefixes=CHECK,MSSA %s +; RUN: opt -S < %s -passes='gvn<memoryssa>',dce | FileCheck --check-prefixes=CHECK,MSSA %s ; Analyze Load from clobbering Load. 
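Note on the GVN RUN-line updates above: each second RUN line drops the explicit `no-memdep` parameter, so `-passes='gvn<memoryssa>'` alone now selects the MemorySSA-based dependency analysis, presumably because enabling `memoryssa` disables MemDep by default after this change. A minimal sketch of a test one could run under either configuration; the function is illustrative, not taken from these files.

; RUN: opt -S -passes=gvn %s | FileCheck %s
; RUN: opt -S -passes='gvn<memoryssa>' %s | FileCheck %s
define i32 @redundant_load(ptr %p) {
  %v1 = load i32, ptr %p, align 4
  %v2 = load i32, ptr %p, align 4   ; fully redundant; GVN forwards %v1
  %sum = add i32 %v1, %v2           ; becomes add i32 %v1, %v1
  ret i32 %sum
}

For a case this simple both configurations should produce identical output; the MDEP/MSSA check prefixes in the real tests exist because the two dependency analyses can diverge on harder cases.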
diff --git a/llvm/test/Transforms/HotColdSplit/lifetime-markers-on-inputs-1.ll b/llvm/test/Transforms/HotColdSplit/lifetime-markers-on-inputs-1.ll index e4e68ae..e5bab0c 100644 --- a/llvm/test/Transforms/HotColdSplit/lifetime-markers-on-inputs-1.ll +++ b/llvm/test/Transforms/HotColdSplit/lifetime-markers-on-inputs-1.ll @@ -36,11 +36,10 @@ outlinedPath: ; These two uses of stack slots are overlapping. This should prevent ; merging of stack slots. CodeExtractor must replicate the effects of ; these markers in the caller to inhibit stack coloring. - %gep1 = getelementptr inbounds i8, ptr %local1, i64 1 - call void @llvm.lifetime.start.p0(i64 1, ptr %gep1) + call void @llvm.lifetime.start.p0(i64 1, ptr %local1) call void @llvm.lifetime.start.p0(i64 1, ptr %local2) call void @cold_use2(ptr %local1, ptr %local2) - call void @llvm.lifetime.end.p0(i64 1, ptr %gep1) + call void @llvm.lifetime.end.p0(i64 1, ptr %local1) call void @llvm.lifetime.end.p0(i64 1, ptr %local2) br i1 undef, label %outlinedPath2, label %outlinedPathExit diff --git a/llvm/test/Transforms/InferAddressSpaces/NVPTX/lifetime.ll b/llvm/test/Transforms/InferAddressSpaces/NVPTX/lifetime.ll index 8bf6312..5926c32 100644 --- a/llvm/test/Transforms/InferAddressSpaces/NVPTX/lifetime.ll +++ b/llvm/test/Transforms/InferAddressSpaces/NVPTX/lifetime.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes=infer-address-spaces %s | FileCheck %s target triple = "nvptx64-nvidia-cuda" @@ -6,11 +7,13 @@ define i32 @lifetime_flat_pointer() { ; CHECK-LABEL: define i32 @lifetime_flat_pointer() { ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[ALLOCA]] to ptr addrspace(5) -; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr +; CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[ALLOCA]]) ; CHECK-NEXT: store i32 1, ptr addrspace(5) [[TMP1]], align 4 -; CHECK-NEXT: %ret = load i32, ptr addrspace(5) [[TMP1]], align 4 -; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]]) -; CHECK-NEXT: ret i32 %ret +; CHECK-NEXT: [[RET:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[ALLOCA]]) +; CHECK-NEXT: ret i32 [[RET]] ; %alloca = alloca i32, align 4 %1 = addrspacecast ptr %alloca to ptr addrspace(5) diff --git a/llvm/test/Transforms/Inline/alloca-bonus.ll b/llvm/test/Transforms/Inline/alloca-bonus.ll index 1dec660..45ff527 100644 --- a/llvm/test/Transforms/Inline/alloca-bonus.ll +++ b/llvm/test/Transforms/Inline/alloca-bonus.ll @@ -3,8 +3,6 @@ target datalayout = "p:32:32" -declare void @llvm.lifetime.start.p0(i64 %size, ptr nocapture %ptr) - @glbl = external global i32 define void @outer1() { @@ -20,7 +18,6 @@ define void @inner1(ptr %ptr) { store i32 0, ptr %ptr %D = getelementptr inbounds i32, ptr %ptr, i32 1 %F = select i1 false, ptr %ptr, ptr @glbl - call void @llvm.lifetime.start.p0(i64 0, ptr %ptr) call void @extern() ret void } @@ -39,7 +36,6 @@ define void @inner2(ptr %ptr) { store i32 0, ptr %ptr %D = getelementptr inbounds i32, ptr %ptr, i32 %A %F = select i1 false, ptr %ptr, ptr @glbl - call void @llvm.lifetime.start.p0(i64 0, ptr %ptr) call void @extern() ret void } @@ -146,7 +142,6 @@ define void @inner5(i1 %flag, ptr %ptr) { if.then: %D = 
getelementptr inbounds i32, ptr %ptr, i32 %A %F = select i1 false, ptr %ptr, ptr @glbl - call void @llvm.lifetime.start.p0(i64 0, ptr %ptr) ret void exit: diff --git a/llvm/test/Transforms/Inline/redundant-loads.ll b/llvm/test/Transforms/Inline/redundant-loads.ll index 773be78..3b066ef 100644 --- a/llvm/test/Transforms/Inline/redundant-loads.ll +++ b/llvm/test/Transforms/Inline/redundant-loads.ll @@ -104,11 +104,8 @@ define void @outer6(ptr %a, ptr %ptr) { ret void } -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) argmemonly nounwind - define void @inner6(ptr %a, ptr %ptr) { %1 = load i32, ptr %a - call void @llvm.lifetime.start.p0(i64 32, ptr %ptr) ; This intrinsic does not clobber the first load. %2 = load i32, ptr %a call void @pad() %3 = load i32, ptr %a diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/wmma-f8f6f4.ll b/llvm/test/Transforms/InstCombine/AMDGPU/wmma-f8f6f4.ll new file mode 100644 index 0000000..d255eb0 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AMDGPU/wmma-f8f6f4.ll @@ -0,0 +1,158 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=instcombine < %s | FileCheck %s + +; ------------------------------------------------------------------------------------ +; Incorrect signature for format cases (IR vector too large) wmma.f32.16x16x128.f8f6f4 +; ------------------------------------------------------------------------------------ + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp6___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp6___v16i32_fp8( +; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 2, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_fp6(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_fp6( +; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A]], i32 2, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, 
i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v16i32_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v16i32_bf8( +; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> [[TMP0]], i32 1, <16 x i32> [[B]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 3, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_bf6(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_bf6( +; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A]], i32 3, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 3, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp4___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp4___v16i32_fp8( +; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_fp4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_fp4( +; CHECK-SAME: <16 x 
i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> [[A]], i32 4, <8 x i32> [[TMP0]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 4, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp8( +; CHECK-SAME: <12 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp4(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp4( +; CHECK-SAME: <16 x i32> [[A:%.*]], <12 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <12 x i32> [[B]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> [[A]], i32 4, <8 x i32> [[TMP0]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp6(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp6( +; CHECK-SAME: <12 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, 
i32 9, i32 10, i32 11> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> [[TMP0]], i32 2, <12 x i32> [[TMP1]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <12 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v12i32_fp4(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v12i32_fp4( +; CHECK-SAME: <16 x i32> [[A:%.*]], <12 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <12 x i32> [[B]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 3, <12 x i32> [[TMP0]], i32 4, <8 x i32> [[TMP1]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 3, <16 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/Transforms/InstCombine/deadcode.ll b/llvm/test/Transforms/InstCombine/deadcode.ll index e65f0ab..f3e1ba6 100644 --- a/llvm/test/Transforms/InstCombine/deadcode.ll +++ b/llvm/test/Transforms/InstCombine/deadcode.ll @@ -26,8 +26,9 @@ declare void @llvm.lifetime.start.p0(i64, ptr) declare void @llvm.lifetime.end.p0(i64, ptr) define void @test3() { - call void @llvm.lifetime.start.p0(i64 -1, ptr undef) - call void @llvm.lifetime.end.p0(i64 -1, ptr undef) + %a = alloca i32 + call void @llvm.lifetime.start.p0(i64 -1, ptr %a) + call void @llvm.lifetime.end.p0(i64 -1, ptr %a) ret void } diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll index 752ff0c..bb0a94c 100644 --- a/llvm/test/Transforms/InstCombine/getelementptr.ll +++ b/llvm/test/Transforms/InstCombine/getelementptr.ll @@ -682,15 +682,15 @@ define i32 @test28() nounwind { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ORIENTATIONS:%.*]] = alloca [1 x [1 x %struct.x]], align 8 ; CHECK-NEXT: [[T3:%.*]] = call i32 @puts(ptr noundef nonnull dereferenceable(1) @.str) #[[ATTR0]] -; CHECK-NEXT: [[T45:%.*]] = getelementptr inbounds nuw i8, ptr [[ORIENTATIONS]], i64 1 ; CHECK-NEXT: br label [[BB10:%.*]] ; CHECK: bb10: ; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[BB10]] ] ; CHECK-NEXT: [[T12_REC:%.*]] = xor i32 [[INDVAR]], -1 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[T12_REC]] to i64 -; CHECK-NEXT: [[T12:%.*]] = getelementptr inbounds i8, ptr [[T45]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], 1 +; CHECK-NEXT: [[T12:%.*]] = getelementptr inbounds i8, ptr [[ORIENTATIONS]], i64 [[TMP1]] ; CHECK-NEXT: [[T16:%.*]] = call i32 (ptr, ...) 
@printf(ptr noundef nonnull dereferenceable(1) @.str1, ptr nonnull [[T12]]) #[[ATTR0]] -; CHECK-NEXT: [[T84:%.*]] = icmp eq i32 [[INDVAR]], 0 +; CHECK-NEXT: [[T84:%.*]] = icmp eq i64 [[TMP1]], 0 ; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 ; CHECK-NEXT: br i1 [[T84]], label [[BB17:%.*]], label [[BB10]] ; CHECK: bb17: diff --git a/llvm/test/Transforms/InstCombine/malloc-free.ll b/llvm/test/Transforms/InstCombine/malloc-free.ll index 989074f..d8a1c07 100644 --- a/llvm/test/Transforms/InstCombine/malloc-free.ll +++ b/llvm/test/Transforms/InstCombine/malloc-free.ll @@ -109,8 +109,6 @@ define void @test3(ptr %src) { ; CHECK-NEXT: ret void ; %a = call noalias ptr @malloc(i32 10) - call void @llvm.lifetime.start.p0(i64 10, ptr %a) - call void @llvm.lifetime.end.p0(i64 10, ptr %a) %size = call i64 @llvm.objectsize.i64(ptr %a, i1 true) store i8 42, ptr %a call void @llvm.memcpy.p0.p0.i32(ptr %a, ptr %src, i32 32, i1 false) diff --git a/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll b/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll index 9a0a6ae..95753a2 100644 --- a/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll +++ b/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll @@ -174,16 +174,12 @@ define { <16 x i8>, <32 x i8> } @differenttypes({ <4 x i32>, <8 x i32> } %a, ptr ; CHECK-LABEL: define { <16 x i8>, <32 x i8> } @differenttypes ; CHECK-SAME: ({ <4 x i32>, <8 x i32> } [[A:%.*]], ptr [[P:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[P]]) ; CHECK-NEXT: store { <4 x i32>, <8 x i32> } [[A]], ptr [[P]], align 16 ; CHECK-NEXT: [[TMP0:%.*]] = load { <16 x i8>, <32 x i8> }, ptr [[P]], align 16 -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull [[P]]) ; CHECK-NEXT: ret { <16 x i8>, <32 x i8> } [[TMP0]] ; entry: - call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull %p) #5 store { <4 x i32>, <8 x i32> } %a, ptr %p, align 16 %2 = load { <16 x i8>, <32 x i8> }, ptr %p, align 16 - call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull %p) #5 ret { <16 x i8>, <32 x i8> } %2 } diff --git a/llvm/test/Transforms/InstCombine/sub-gep.ll b/llvm/test/Transforms/InstCombine/sub-gep.ll index 11af6b4..84e5703 100644 --- a/llvm/test/Transforms/InstCombine/sub-gep.ll +++ b/llvm/test/Transforms/InstCombine/sub-gep.ll @@ -945,19 +945,15 @@ define i64 @multiple_geps_two_chains_gep_base(ptr %base, i64 %base.idx, i64 %idx define i64 @multiple_geps_two_chains_multi_use(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) { ; CHECK-LABEL: @multiple_geps_two_chains_multi_use( -; CHECK-NEXT: [[P2_IDX:%.*]] = shl nsw i64 [[IDX2:%.*]], 2 -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 [[P2_IDX]] -; CHECK-NEXT: [[P4_IDX:%.*]] = shl nsw i64 [[IDX4:%.*]], 2 -; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[P4_IDX]] -; CHECK-NEXT: [[P3_IDX:%.*]] = shl nsw i64 [[IDX3:%.*]], 2 -; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[P3_IDX]] -; CHECK-NEXT: [[P4_IDX1:%.*]] = shl nsw i64 [[IDX5:%.*]], 2 -; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, ptr [[P3]], i64 [[P4_IDX1]] +; CHECK-NEXT: [[P1_IDX1:%.*]] = add i64 [[IDX1:%.*]], [[IDX2:%.*]] +; CHECK-NEXT: [[P4_IDX:%.*]] = shl i64 [[P1_IDX1]], 2 +; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 [[P4_IDX]] +; CHECK-NEXT: [[P3_IDX2:%.*]] = add i64 [[IDX3:%.*]], [[IDX4:%.*]] +; CHECK-NEXT: [[P4_IDX1:%.*]] = shl i64 [[P3_IDX2]], 2 +; CHECK-NEXT: [[P4:%.*]] = getelementptr 
inbounds i8, ptr [[P2]], i64 [[P4_IDX1]] ; CHECK-NEXT: call void @use(ptr [[P5]]) ; CHECK-NEXT: call void @use(ptr [[P4]]) -; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[P2_IDX]], [[P4_IDX]] -; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[P3_IDX]], [[P4_IDX1]] -; CHECK-NEXT: [[GEPDIFF:%.*]] = sub nsw i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[GEPDIFF:%.*]] = sub nsw i64 [[P4_IDX]], [[P4_IDX1]] ; CHECK-NEXT: ret i64 [[GEPDIFF]] ; %p1 = getelementptr inbounds i32, ptr %base, i64 %idx1 @@ -974,23 +970,18 @@ define i64 @multiple_geps_two_chains_multi_use(ptr %base, i64 %idx1, i64 %idx2, define i64 @multiple_geps_two_chains_partial_multi_use(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4, i64 %idx5, i64 %idx6) { ; CHECK-LABEL: @multiple_geps_two_chains_partial_multi_use( -; CHECK-NEXT: [[P2_IDX:%.*]] = shl nsw i64 [[IDX2:%.*]], 2 -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 [[P2_IDX]] -; CHECK-NEXT: [[P4_IDX:%.*]] = shl nsw i64 [[IDX4:%.*]], 2 -; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[P4_IDX]] -; CHECK-NEXT: [[P3_IDX:%.*]] = shl nsw i64 [[IDX3:%.*]], 2 -; CHECK-NEXT: [[P4_IDX1:%.*]] = shl nsw i64 [[IDX7:%.*]], 2 -; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[P4_IDX1]] -; CHECK-NEXT: [[P5_IDX:%.*]] = shl nsw i64 [[IDX5:%.*]], 2 -; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, ptr [[P5]], i64 [[P5_IDX]] -; CHECK-NEXT: [[P6_IDX:%.*]] = shl nsw i64 [[IDX6:%.*]], 2 +; CHECK-NEXT: [[P1_IDX1:%.*]] = add i64 [[IDX1:%.*]], [[IDX2:%.*]] +; CHECK-NEXT: [[P4_IDX:%.*]] = shl i64 [[P1_IDX1]], 2 +; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 [[P4_IDX]] +; CHECK-NEXT: [[P4_IDX2:%.*]] = add i64 [[IDX4:%.*]], [[IDX5:%.*]] +; CHECK-NEXT: [[P5_IDX:%.*]] = shl i64 [[P4_IDX2]], 2 +; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[P5_IDX]] ; CHECK-NEXT: call void @use(ptr [[P3]]) ; CHECK-NEXT: call void @use(ptr [[P4]]) -; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[P2_IDX]], [[P4_IDX]] -; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[TMP1]], [[P3_IDX]] -; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[P4_IDX1]], [[P5_IDX]] -; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP3]], [[P6_IDX]] -; CHECK-NEXT: [[GEPDIFF:%.*]] = sub nsw i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[P1_IDX1]], [[IDX3:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[P4_IDX2]], [[IDX6:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[GEPDIFF:%.*]] = shl i64 [[TMP5]], 2 ; CHECK-NEXT: ret i64 [[GEPDIFF]] ; %p1 = getelementptr inbounds i32, ptr %base, i64 %idx1 @@ -1007,6 +998,29 @@ define i64 @multiple_geps_two_chains_partial_multi_use(ptr %base, i64 %idx1, i64 ret i64 %d } +define i64 @multiple_geps_two_chains_partial_multi_use_insert_point(ptr %p, i64 %idx1, i64 %idx2, i64 %idx3) { +; CHECK-LABEL: @multiple_geps_two_chains_partial_multi_use_insert_point( +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 8 +; CHECK-NEXT: call void @use(ptr [[GEP2]]) +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[IDX2:%.*]], [[IDX3:%.*]] +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr i8, ptr [[GEP2]], i64 [[TMP1]] +; CHECK-NEXT: call void @use(ptr [[GEP4]]) +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 8 +; CHECK-NEXT: [[GEPDIFF:%.*]] = sub i64 [[IDX1:%.*]], [[TMP2]] +; CHECK-NEXT: ret i64 [[GEPDIFF]] +; + %gep1 = getelementptr i8, ptr %p, i64 %idx1 + %gep2 = getelementptr i8, ptr %p, i64 8 + call void @use(ptr %gep2) + %gep3 = getelementptr i8, ptr %gep2, i64 %idx2 + %gep4 = getelementptr 
i8, ptr %gep3, i64 %idx3 + call void @use(ptr %gep4) + %gep1.int = ptrtoint ptr %gep1 to i64 + %gep4.int = ptrtoint ptr %gep4 to i64 + %sub = sub i64 %gep1.int, %gep4.int + ret i64 %sub +} + define i64 @multiple_geps_inbounds(ptr %base, i64 %idx, i64 %idx2) { ; CHECK-LABEL: @multiple_geps_inbounds( ; CHECK-NEXT: [[D:%.*]] = add nsw i64 [[IDX:%.*]], [[IDX2:%.*]] diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-unary-arithmetic.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-unary-arithmetic.ll new file mode 100644 index 0000000..75b8509 --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-unary-arithmetic.ll @@ -0,0 +1,646 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=instsimplify -march=nvptx64 -S | FileCheck %s + +; Test constant-folding for various NVVM unary arithmetic intrinsics. + +;############################################################### +;# Ceil # +;############################################################### + +define double @test_ceil_d_1_25() { +; CHECK-LABEL: define double @test_ceil_d_1_25() { +; CHECK-NEXT: ret double 2.000000e+00 +; + %res = call double @llvm.nvvm.ceil.d(double 1.25) + ret double %res +} + +define float @test_ceil_f_1_25() { +; CHECK-LABEL: define float @test_ceil_f_1_25() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.ceil.f(float 1.25) + ret float %res +} + +define float @test_ceil_ftz_f_1_25() { +; CHECK-LABEL: define float @test_ceil_ftz_f_1_25() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.ceil.ftz.f(float 1.25) + ret float %res +} + +define double @test_ceil_d_pos_subnorm() { +; CHECK-LABEL: define double @test_ceil_d_pos_subnorm() { +; CHECK-NEXT: ret double 1.000000e+00 +; + %res = call double @llvm.nvvm.ceil.d(double 0x380FFFFFC0000000) + ret double %res +} + +define float @test_ceil_f_pos_subnorm() { +; CHECK-LABEL: define float @test_ceil_f_pos_subnorm() { +; CHECK-NEXT: ret float 1.000000e+00 +; + %res = call float @llvm.nvvm.ceil.f(float 0x380FFFFFC0000000) + ret float %res +} + +define float @test_ceil_ftz_f_pos_subnorm() { +; CHECK-LABEL: define float @test_ceil_ftz_f_pos_subnorm() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.ceil.ftz.f(float 0x380FFFFFC0000000) + ret float %res +} + +;############################################################### +;# FAbs # +;############################################################### + +define float @test_fabs_neg_1_5() { +; CHECK-LABEL: define float @test_fabs_neg_1_5() { +; CHECK-NEXT: ret float 1.500000e+00 +; + %res = call float @llvm.nvvm.fabs(float -1.5) + ret float %res +} + +define float @test_fabs_ftz_neg_1_5() { +; CHECK-LABEL: define float @test_fabs_ftz_neg_1_5() { +; CHECK-NEXT: ret float 1.500000e+00 +; + %res = call float @llvm.nvvm.fabs.ftz(float -1.5) + ret float %res +} + +define float @test_fabs_1_25() { +; CHECK-LABEL: define float @test_fabs_1_25() { +; CHECK-NEXT: ret float 1.250000e+00 +; + %res = call float @llvm.nvvm.fabs(float 1.25) + ret float %res +} + +define float @test_fabs_ftz_1_25() { +; CHECK-LABEL: define float @test_fabs_ftz_1_25() { +; CHECK-NEXT: ret float 1.250000e+00 +; + %res = call float @llvm.nvvm.fabs.ftz(float 1.25) + ret float %res +} + +define float @test_fabs_neg_subnorm() { +; CHECK-LABEL: define float @test_fabs_neg_subnorm() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fabs(float 0xB80FFFFFC0000000) + ret float 
%res +} + +define float @test_fabs_ftz_neg_subnorm() { +; CHECK-LABEL: define float @test_fabs_ftz_neg_subnorm() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fabs.ftz(float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_fabs_pos_subnorm() { +; CHECK-LABEL: define float @test_fabs_pos_subnorm() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fabs(float 0x380FFFFFC0000000) + ret float %res +} + +define float @test_fabs_ftz_pos_subnorm() { +; CHECK-LABEL: define float @test_fabs_ftz_pos_subnorm() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fabs.ftz(float 0x380FFFFFC0000000) + ret float %res +} + + +;############################################################### +;# Floor # +;############################################################### + +define double @test_floor_d_1_25() { +; CHECK-LABEL: define double @test_floor_d_1_25() { +; CHECK-NEXT: ret double 1.000000e+00 +; + %res = call double @llvm.nvvm.floor.d(double 1.25) + ret double %res +} + +define float @test_floor_f_1_25() { +; CHECK-LABEL: define float @test_floor_f_1_25() { +; CHECK-NEXT: ret float 1.000000e+00 +; + %res = call float @llvm.nvvm.floor.f(float 1.25) + ret float %res +} + +define float @test_floor_ftz_f_1_25() { +; CHECK-LABEL: define float @test_floor_ftz_f_1_25() { +; CHECK-NEXT: ret float 1.000000e+00 +; + %res = call float @llvm.nvvm.floor.ftz.f(float 1.25) + ret float %res +} + +define double @test_floor_d_neg_subnorm() { +; CHECK-LABEL: define double @test_floor_d_neg_subnorm() { +; CHECK-NEXT: ret double -1.000000e+00 +; + %res = call double @llvm.nvvm.floor.d(double 0xB80FFFFFC0000000) + ret double %res +} + +define float @test_floor_f_neg_subnorm() { +; CHECK-LABEL: define float @test_floor_f_neg_subnorm() { +; CHECK-NEXT: ret float -1.000000e+00 +; + %res = call float @llvm.nvvm.floor.f(float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_floor_ftz_f_neg_subnorm() { +; CHECK-LABEL: define float @test_floor_ftz_f_neg_subnorm() { +; CHECK-NEXT: ret float -0.000000e+00 +; + %res = call float @llvm.nvvm.floor.ftz.f(float 0xB80FFFFFC0000000) + ret float %res +} + +;############################################################### +;# Rcp # +;############################################################### + +;+-------------------------------------------------------------+ +;| rcp_rm | +;+-------------------------------------------------------------+ +define double @test_rcp_rm_d_0_5() { +; CHECK-LABEL: define double @test_rcp_rm_d_0_5() { +; CHECK-NEXT: ret double 2.000000e+00 +; + %res = call double @llvm.nvvm.rcp.rm.d(double 0.5) + ret double %res +} + +define float @test_rcp_rm_f_0_5() { +; CHECK-LABEL: define float @test_rcp_rm_f_0_5() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.rcp.rm.f(float 0.5) + ret float %res +} + +define float @test_rcp_rm_ftz_f_0_5() { +; CHECK-LABEL: define float @test_rcp_rm_ftz_f_0_5() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.rcp.rm.ftz.f(float 0.5) + ret float %res +} + +define double @test_rcp_rm_d_neg_subnorm() { +; CHECK-LABEL: define double @test_rcp_rm_d_neg_subnorm() { +; CHECK-NEXT: ret double 0xC7D0000020000041 +; + %res = call double @llvm.nvvm.rcp.rm.d(double 0xB80FFFFFC0000000) + ret double %res +} + +define float @test_rcp_rm_f_neg_subnorm() { +; CHECK-LABEL: define float @test_rcp_rm_f_neg_subnorm() { +; CHECK-NEXT: ret float 0xC7D0000040000000 +; + %res = call float @llvm.nvvm.rcp.rm.f(float 
0xB80FFFFFC0000000) + ret float %res +} + +define float @test_rcp_rm_ftz_f_neg_subnorm() { +; CHECK-LABEL: define float @test_rcp_rm_ftz_f_neg_subnorm() { +; CHECK-NEXT: [[RES:%.*]] = call float @llvm.nvvm.rcp.rm.ftz.f(float 0xB80FFFFFC0000000) +; CHECK-NEXT: ret float [[RES]] +; + %res = call float @llvm.nvvm.rcp.rm.ftz.f(float 0xB80FFFFFC0000000) + ret float %res +} + +;+-------------------------------------------------------------+ +;| rcp_rn | +;+-------------------------------------------------------------+ +define double @test_rcp_rn_d_0_5() { +; CHECK-LABEL: define double @test_rcp_rn_d_0_5() { +; CHECK-NEXT: ret double 2.000000e+00 +; + %res = call double @llvm.nvvm.rcp.rn.d(double 0.5) + ret double %res +} + +define float @test_rcp_rn_f_0_5() { +; CHECK-LABEL: define float @test_rcp_rn_f_0_5() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.rcp.rn.f(float 0.5) + ret float %res +} + +define float @test_rcp_rn_ftz_f_0_5() { +; CHECK-LABEL: define float @test_rcp_rn_ftz_f_0_5() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.rcp.rn.ftz.f(float 0.5) + ret float %res +} + +define double @test_rcp_rn_d_neg_subnorm() { +; CHECK-LABEL: define double @test_rcp_rn_d_neg_subnorm() { +; CHECK-NEXT: ret double 0xC7D0000020000040 +; + %res = call double @llvm.nvvm.rcp.rn.d(double 0xB80FFFFFC0000000) + ret double %res +} + +define float @test_rcp_rn_f_neg_subnorm() { +; CHECK-LABEL: define float @test_rcp_rn_f_neg_subnorm() { +; CHECK-NEXT: ret float 0xC7D0000020000000 +; + %res = call float @llvm.nvvm.rcp.rn.f(float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_rcp_rn_ftz_f_neg_subnorm() { +; CHECK-LABEL: define float @test_rcp_rn_ftz_f_neg_subnorm() { +; CHECK-NEXT: [[RES:%.*]] = call float @llvm.nvvm.rcp.rn.ftz.f(float 0xB80FFFFFC0000000) +; CHECK-NEXT: ret float [[RES]] +; + %res = call float @llvm.nvvm.rcp.rn.ftz.f(float 0xB80FFFFFC0000000) + ret float %res +} + +;+-------------------------------------------------------------+ +;| rcp_rp | +;+-------------------------------------------------------------+ +define double @test_rcp_rp_d_0_5() { +; CHECK-LABEL: define double @test_rcp_rp_d_0_5() { +; CHECK-NEXT: ret double 2.000000e+00 +; + %res = call double @llvm.nvvm.rcp.rp.d(double 0.5) + ret double %res +} + +define float @test_rcp_rp_f_0_5() { +; CHECK-LABEL: define float @test_rcp_rp_f_0_5() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.rcp.rp.f(float 0.5) + ret float %res +} + +define float @test_rcp_rp_ftz_f_0_5() { +; CHECK-LABEL: define float @test_rcp_rp_ftz_f_0_5() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.rcp.rp.ftz.f(float 0.5) + ret float %res +} + +define double @test_rcp_rp_d_neg_subnorm() { +; CHECK-LABEL: define double @test_rcp_rp_d_neg_subnorm() { +; CHECK-NEXT: ret double 0xC7D0000020000040 +; + %res = call double @llvm.nvvm.rcp.rp.d(double 0xB80FFFFFC0000000) + ret double %res +} + +define float @test_rcp_rp_f_neg_subnorm() { +; CHECK-LABEL: define float @test_rcp_rp_f_neg_subnorm() { +; CHECK-NEXT: ret float 0xC7D0000020000000 +; + %res = call float @llvm.nvvm.rcp.rp.f(float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_rcp_rp_ftz_f_neg_subnorm() { +; CHECK-LABEL: define float @test_rcp_rp_ftz_f_neg_subnorm() { +; CHECK-NEXT: [[RES:%.*]] = call float @llvm.nvvm.rcp.rp.ftz.f(float 0xB80FFFFFC0000000) +; CHECK-NEXT: ret float [[RES]] +; + %res = call float @llvm.nvvm.rcp.rp.ftz.f(float 0xB80FFFFFC0000000) + ret float %res +} + 
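+; Note on the rcp ftz cases: for a normal input such as 0.5 the .ftz.f
+; variants fold just like their non-ftz counterparts, but for the
+; subnormal input 0xB80FFFFFC0000000 the checks (in rcp_rm/rn/rp above
+; and rcp_rz below) keep the call unfolded. A plausible reading,
+; assuming flush-to-zero semantics: the operand would be flushed to
+; -0.0, making the exact reciprocal -inf, so no constant is committed.
+; Illustrative unfolded form (matching the CHECK lines above):
+;   %res = call float @llvm.nvvm.rcp.rn.ftz.f(float 0xB80FFFFFC0000000)
+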
+;+-------------------------------------------------------------+ +;| rcp_rz | +;+-------------------------------------------------------------+ +define double @test_rcp_rz_d_0_5() { +; CHECK-LABEL: define double @test_rcp_rz_d_0_5() { +; CHECK-NEXT: ret double 2.000000e+00 +; + %res = call double @llvm.nvvm.rcp.rz.d(double 0.5) + ret double %res +} + +define float @test_rcp_rz_f_0_5() { +; CHECK-LABEL: define float @test_rcp_rz_f_0_5() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.rcp.rz.f(float 0.5) + ret float %res +} + +define float @test_rcp_rz_ftz_f_0_5() { +; CHECK-LABEL: define float @test_rcp_rz_ftz_f_0_5() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.rcp.rz.ftz.f(float 0.5) + ret float %res +} + +define double @test_rcp_rz_d_neg_subnorm() { +; CHECK-LABEL: define double @test_rcp_rz_d_neg_subnorm() { +; CHECK-NEXT: ret double 0xC7D0000020000040 +; + %res = call double @llvm.nvvm.rcp.rz.d(double 0xB80FFFFFC0000000) + ret double %res +} + +define float @test_rcp_rz_f_neg_subnorm() { +; CHECK-LABEL: define float @test_rcp_rz_f_neg_subnorm() { +; CHECK-NEXT: ret float 0xC7D0000020000000 +; + %res = call float @llvm.nvvm.rcp.rz.f(float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_rcp_rz_ftz_f_neg_subnorm() { +; CHECK-LABEL: define float @test_rcp_rz_ftz_f_neg_subnorm() { +; CHECK-NEXT: [[RES:%.*]] = call float @llvm.nvvm.rcp.rz.ftz.f(float 0xB80FFFFFC0000000) +; CHECK-NEXT: ret float [[RES]] +; + %res = call float @llvm.nvvm.rcp.rz.ftz.f(float 0xB80FFFFFC0000000) + ret float %res +} + +;############################################################### +;# Round # +;############################################################### + +define double @test_round_d_neg_1_5() { +; CHECK-LABEL: define double @test_round_d_neg_1_5() { +; CHECK-NEXT: ret double -2.000000e+00 +; + %res = call double @llvm.nvvm.round.d(double -1.5) + ret double %res +} + +define float @test_round_f_neg_1_5() { +; CHECK-LABEL: define float @test_round_f_neg_1_5() { +; CHECK-NEXT: ret float -2.000000e+00 +; + %res = call float @llvm.nvvm.round.f(float -1.5) + ret float %res +} + +define float @test_round_ftz_f_neg_1_5() { +; CHECK-LABEL: define float @test_round_ftz_f_neg_1_5() { +; CHECK-NEXT: ret float -2.000000e+00 +; + %res = call float @llvm.nvvm.round.ftz.f(float -1.5) + ret float %res +} + +define double @test_round_d_neg_subnorm() { +; CHECK-LABEL: define double @test_round_d_neg_subnorm() { +; CHECK-NEXT: ret double -0.000000e+00 +; + %res = call double @llvm.nvvm.round.d(double 0xB80FFFFFC0000000) + ret double %res +} + +define float @test_round_f_neg_subnorm() { +; CHECK-LABEL: define float @test_round_f_neg_subnorm() { +; CHECK-NEXT: ret float -0.000000e+00 +; + %res = call float @llvm.nvvm.round.f(float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_round_ftz_f_neg_subnorm() { +; CHECK-LABEL: define float @test_round_ftz_f_neg_subnorm() { +; CHECK-NEXT: ret float -0.000000e+00 +; + %res = call float @llvm.nvvm.round.ftz.f(float 0xB80FFFFFC0000000) + ret float %res +} + +;############################################################### +;# Saturate # +;############################################################### + +define double @test_saturate_d_1_25() { +; CHECK-LABEL: define double @test_saturate_d_1_25() { +; CHECK-NEXT: ret double 1.000000e+00 +; + %res = call double @llvm.nvvm.saturate.d(double 1.25) + ret double %res +} + +define float @test_saturate_f_1_25() { +; CHECK-LABEL: define float 
@test_saturate_f_1_25() { +; CHECK-NEXT: ret float 1.000000e+00 +; + %res = call float @llvm.nvvm.saturate.f(float 1.25) + ret float %res +} + +define float @test_saturate_ftz_f_1_25() { +; CHECK-LABEL: define float @test_saturate_ftz_f_1_25() { +; CHECK-NEXT: ret float 1.000000e+00 +; + %res = call float @llvm.nvvm.saturate.ftz.f(float 1.25) + ret float %res +} + +define double @test_saturate_d_neg_1_25() { +; CHECK-LABEL: define double @test_saturate_d_neg_1_25() { +; CHECK-NEXT: ret double 0.000000e+00 +; + %res = call double @llvm.nvvm.saturate.d(double -1.25) + ret double %res +} + +define float @test_saturate_f_neg_1_25() { +; CHECK-LABEL: define float @test_saturate_f_neg_1_25() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.saturate.f(float -1.25) + ret float %res +} + +define float @test_saturate_ftz_f_neg_1_25() { +; CHECK-LABEL: define float @test_saturate_ftz_f_neg_1_25() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.saturate.ftz.f(float -1.25) + ret float %res +} + +define double @test_saturate_d_0_5() { +; CHECK-LABEL: define double @test_saturate_d_0_5() { +; CHECK-NEXT: ret double 5.000000e-01 +; + %res = call double @llvm.nvvm.saturate.d(double 0.5) + ret double %res +} + +define float @test_saturate_f_0_5() { +; CHECK-LABEL: define float @test_saturate_f_0_5() { +; CHECK-NEXT: ret float 5.000000e-01 +; + %res = call float @llvm.nvvm.saturate.f(float 0.5) + ret float %res +} + +define float @test_saturate_ftz_f_0_5() { +; CHECK-LABEL: define float @test_saturate_ftz_f_0_5() { +; CHECK-NEXT: ret float 5.000000e-01 +; + %res = call float @llvm.nvvm.saturate.ftz.f(float 0.5) + ret float %res +} + +define double @test_saturate_d_pos_subnorm() { +; CHECK-LABEL: define double @test_saturate_d_pos_subnorm() { +; CHECK-NEXT: ret double 0x380FFFFFC0000000 +; + %res = call double @llvm.nvvm.saturate.d(double 0x380FFFFFC0000000) + ret double %res +} + +define float @test_saturate_f_pos_subnorm() { +; CHECK-LABEL: define float @test_saturate_f_pos_subnorm() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.saturate.f(float 0x380FFFFFC0000000) + ret float %res +} + +define float @test_saturate_ftz_f_pos_subnorm() { +; CHECK-LABEL: define float @test_saturate_ftz_f_pos_subnorm() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.saturate.ftz.f(float 0x380FFFFFC0000000) + ret float %res +} + +;############################################################### +;# Sqrt # +;############################################################### + +define float @test_sqrt_f_4() { +; CHECK-LABEL: define float @test_sqrt_f_4() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.sqrt.f(float 4.0) + ret float %res +} + +define float @test_sqrt_rn_f_4() { +; CHECK-LABEL: define float @test_sqrt_rn_f_4() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.sqrt.rn.f(float 4.0) + ret float %res +} + +define double @test_sqrt_rn_d_4() { +; CHECK-LABEL: define double @test_sqrt_rn_d_4() { +; CHECK-NEXT: ret double 2.000000e+00 +; + %res = call double @llvm.nvvm.sqrt.rn.d(double 4.0) + ret double %res +} + +define float @test_sqrt_rn_ftz_f_4() { +; CHECK-LABEL: define float @test_sqrt_rn_ftz_f_4() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.sqrt.rn.ftz.f(float 4.0) + ret float %res +} + +define float @test_sqrt_f_pos_subnorm() { +; CHECK-LABEL: define float @test_sqrt_f_pos_subnorm() { +; CHECK-NEXT: ret float 0x3BFFFFFFE0000000 +; + 
%res = call float @llvm.nvvm.sqrt.f(float 0x380FFFFFC0000000) + ret float %res +} + +define float @test_sqrt_rn_f_pos_subnorm() { +; CHECK-LABEL: define float @test_sqrt_rn_f_pos_subnorm() { +; CHECK-NEXT: ret float 0x3BFFFFFFE0000000 +; + %res = call float @llvm.nvvm.sqrt.rn.f(float 0x380FFFFFC0000000) + ret float %res +} + +define double @test_sqrt_rn_d_pos_subnorm() { +; CHECK-LABEL: define double @test_sqrt_rn_d_pos_subnorm() { +; CHECK-NEXT: ret double 0x3BFFFFFFDFFFFFF0 +; + %res = call double @llvm.nvvm.sqrt.rn.d(double 0x380FFFFFC0000000) + ret double %res +} + +define float @test_sqrt_rn_ftz_f_pos_subnorm() { +; CHECK-LABEL: define float @test_sqrt_rn_ftz_f_pos_subnorm() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.sqrt.rn.ftz.f(float 0x380FFFFFC0000000) + ret float %res +} + +declare double @llvm.nvvm.ceil.d(double) +declare float @llvm.nvvm.ceil.f(float) +declare float @llvm.nvvm.ceil.ftz.f(float) + +declare float @llvm.nvvm.fabs(float) +declare float @llvm.nvvm.fabs.ftz(float) + +declare double @llvm.nvvm.floor.d(double) +declare float @llvm.nvvm.floor.f(float) +declare float @llvm.nvvm.floor.ftz.f(float) + +declare double @llvm.nvvm.rcp.rm.d(double) +declare float @llvm.nvvm.rcp.rm.f(float) +declare float @llvm.nvvm.rcp.rm.ftz.f(float) +declare double @llvm.nvvm.rcp.rn.d(double) +declare float @llvm.nvvm.rcp.rn.f(float) +declare float @llvm.nvvm.rcp.rn.ftz.f(float) +declare double @llvm.nvvm.rcp.rp.d(double) +declare float @llvm.nvvm.rcp.rp.f(float) +declare float @llvm.nvvm.rcp.rp.ftz.f(float) +declare double @llvm.nvvm.rcp.rz.d(double) +declare float @llvm.nvvm.rcp.rz.f(float) +declare float @llvm.nvvm.rcp.rz.ftz.f(float) + +declare double @llvm.nvvm.round.d(double) +declare float @llvm.nvvm.round.f(float) +declare float @llvm.nvvm.round.ftz.f(float) + +declare double @llvm.nvvm.saturate.d(double) +declare float @llvm.nvvm.saturate.f(float) +declare float @llvm.nvvm.saturate.ftz.f(float) + +declare float @llvm.nvvm.sqrt.f(float) +declare double @llvm.nvvm.sqrt.rn.d(double) +declare float @llvm.nvvm.sqrt.rn.f(float) +declare float @llvm.nvvm.sqrt.rn.ftz.f(float) diff --git a/llvm/test/Transforms/InstSimplify/exp10.ll b/llvm/test/Transforms/InstSimplify/exp10.ll index c415c41..17c0811 100644 --- a/llvm/test/Transforms/InstSimplify/exp10.ll +++ b/llvm/test/Transforms/InstSimplify/exp10.ll @@ -57,8 +57,7 @@ define <vscale x 2 x float> @exp10_exp10_scalable_vector(<vscale x 2 x float> %x define float @exp10_poison() { ; CHECK-LABEL: define float @exp10_poison() { -; CHECK-NEXT: [[RET:%.*]] = call float @llvm.exp10.f32(float poison) -; CHECK-NEXT: ret float [[RET]] +; CHECK-NEXT: ret float poison ; %ret = call float @llvm.exp10.f32(float poison) ret float %ret @@ -66,8 +65,7 @@ define float @exp10_poison() { define <2 x float> @exp10_poison_vector() { ; CHECK-LABEL: define <2 x float> @exp10_poison_vector() { -; CHECK-NEXT: [[RET:%.*]] = call <2 x float> @llvm.exp10.v2f32(<2 x float> poison) -; CHECK-NEXT: ret <2 x float> [[RET]] +; CHECK-NEXT: ret <2 x float> poison ; %ret = call <2 x float> @llvm.exp10.v2f32(<2 x float> poison) ret <2 x float> %ret @@ -75,8 +73,7 @@ define <2 x float> @exp10_poison_vector() { define <vscale x 2 x float> @exp10_poison_scaleable_vector() { ; CHECK-LABEL: define <vscale x 2 x float> @exp10_poison_scaleable_vector() { -; CHECK-NEXT: [[RET:%.*]] = call <vscale x 2 x float> @llvm.exp10.nxv2f32(<vscale x 2 x float> poison) -; CHECK-NEXT: ret <vscale x 2 x float> [[RET]] +; CHECK-NEXT: ret <vscale x 2 x float> poison ; %ret 
= call <vscale x 2 x float> @llvm.exp10.nxv2f32(<vscale x 2 x float> poison) ret <vscale x 2 x float> %ret diff --git a/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll b/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll index e4cfa46..45f5e37 100644 --- a/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll +++ b/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll @@ -286,3 +286,327 @@ define void @tanh_poison(ptr %P) { ret void } + + +define void @exp_poison(ptr %P) { +; CHECK-LABEL: @exp_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %exp_f32 = call float @llvm.exp(float poison) + store volatile float %exp_f32, ptr %P + + %exp_2xf32 = call <2 x float> @llvm.exp(<2 x float> poison) + store volatile <2 x float> %exp_2xf32, ptr %P + + %exp_4xf64 = call <4 x double> @llvm.exp(<4 x double> poison) + store volatile <4 x double> %exp_4xf64, ptr %P + + %exp2_f32 = call float @llvm.exp2(float poison) + store volatile float %exp2_f32, ptr %P + + %exp2_2xf32 = call <2 x float> @llvm.exp2(<2 x float> poison) + store volatile <2 x float> %exp2_2xf32, ptr %P + + %exp2_4xf64 = call <4 x double> @llvm.exp2(<4 x double> poison) + store volatile <4 x double> %exp2_4xf64, ptr %P + + %exp10_f32 = call float @llvm.exp10(float poison) + store volatile float %exp10_f32, ptr %P + + %exp10_2xf32 = call <2 x float> @llvm.exp10(<2 x float> poison) + store volatile <2 x float> %exp10_2xf32, ptr %P + + %exp10_4xf64 = call <4 x double> @llvm.exp10(<4 x double> poison) + store volatile <4 x double> %exp10_4xf64, ptr %P + ret void +} + + +define void @log_poison(ptr %P) { +; CHECK-LABEL: @log_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %log_f32 = call float @llvm.log(float poison) + store volatile float %log_f32, ptr %P + + %log_2xf32 = call <2 x float> @llvm.log(<2 x float> poison) + store volatile <2 x float> %log_2xf32, ptr %P + + %log_4xf64 = call <4 x double> @llvm.log(<4 x double> poison) + store volatile <4 x double> %log_4xf64, ptr %P + + %log2_f32 = call float @llvm.log2(float poison) + store volatile float %log2_f32, ptr %P + + %log2_2xf32 = call <2 x float> @llvm.log2(<2 x float> poison) + store volatile <2 x float> %log2_2xf32, ptr %P + + %log2_4xf64 = call <4 x double> @llvm.log2(<4 x double> poison) + store volatile <4 x double> %log2_4xf64, ptr %P + + %log10_f32 = call float @llvm.log10(float poison) + store 
volatile float %log10_f32, ptr %P + + %log10_2xf32 = call <2 x float> @llvm.log10(<2 x float> poison) + store volatile <2 x float> %log10_2xf32, ptr %P + + %log10_4xf64 = call <4 x double> @llvm.log10(<4 x double> poison) + store volatile <4 x double> %log10_4xf64, ptr %P + ret void +} + + +define void @modf_poison(ptr %P) { +; CHECK-LABEL: @modf_poison( +; CHECK-NEXT: store volatile { float, float } poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile { <2 x float>, <2 x float> } poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile { <4 x double>, <4 x double> } poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %modf_f32 = call { float, float } @llvm.modf(float poison) + store volatile { float, float } %modf_f32, ptr %P + + %modf_2xf32 = call { <2 x float>, <2 x float> } @llvm.modf(<2 x float> poison) + store volatile { <2 x float>, <2 x float> } %modf_2xf32, ptr %P + + %modf_4xf64 = call { <4 x double>, <4 x double> } @llvm.modf(<4 x double> poison) + store volatile { <4 x double>, <4 x double> } %modf_4xf64, ptr %P + + ret void +} + + +define void @floor_poison(ptr %P) { +; CHECK-LABEL: @floor_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %floor_f32 = call float @llvm.floor(float poison) + store volatile float %floor_f32, ptr %P + + %floor_2xf32 = call <2 x float> @llvm.floor(<2 x float> poison) + store volatile <2 x float> %floor_2xf32, ptr %P + + %floor_4xf64 = call <4 x double> @llvm.floor(<4 x double> poison) + store volatile <4 x double> %floor_4xf64, ptr %P + + ret void +} + + +define void @ceil_poison(ptr %P) { +; CHECK-LABEL: @ceil_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %ceil_f32 = call float @llvm.ceil(float poison) + store volatile float %ceil_f32, ptr %P + + %ceil_2xf32 = call <2 x float> @llvm.ceil(<2 x float> poison) + store volatile <2 x float> %ceil_2xf32, ptr %P + + %ceil_4xf64 = call <4 x double> @llvm.ceil(<4 x double> poison) + store volatile <4 x double> %ceil_4xf64, ptr %P + + ret void +} + + +define void @trunc_poison(ptr %P) { +; CHECK-LABEL: @trunc_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %trunc_f32 = call float @llvm.trunc(float poison) + store volatile float %trunc_f32, ptr %P + + %trunc_2xf32 = call <2 x float> @llvm.trunc(<2 x float> poison) + store volatile <2 x float> %trunc_2xf32, ptr %P + + %trunc_4xf64 = call <4 x double> @llvm.trunc(<4 x double> poison) + store volatile <4 x double> %trunc_4xf64, ptr %P + + ret void +} + +define void @rint_poison(ptr %P) { +; CHECK-LABEL: @rint_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %rint_f32 = call float @llvm.rint(float poison) + store volatile float %rint_f32, ptr %P + + %rint_2xf32 = call <2 x float> @llvm.rint(<2 x float> poison) + store volatile <2 x float> %rint_2xf32, ptr %P + + %rint_4xf64 = call 
<4 x double> @llvm.rint(<4 x double> poison) + store volatile <4 x double> %rint_4xf64, ptr %P + + ret void +} + +define void @nearbyint_poison(ptr %P) { +; CHECK-LABEL: @nearbyint_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %nearbyint_f32 = call float @llvm.nearbyint(float poison) + store volatile float %nearbyint_f32, ptr %P + + %nearbyint_2xf32 = call <2 x float> @llvm.nearbyint(<2 x float> poison) + store volatile <2 x float> %nearbyint_2xf32, ptr %P + + %nearbyint_4xf64 = call <4 x double> @llvm.nearbyint(<4 x double> poison) + store volatile <4 x double> %nearbyint_4xf64, ptr %P + + ret void +} + + +define void @round_poison(ptr %P) { +; CHECK-LABEL: @round_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %round_f32 = call float @llvm.round(float poison) + store volatile float %round_f32, ptr %P + + %round_2xf32 = call <2 x float> @llvm.round(<2 x float> poison) + store volatile <2 x float> %round_2xf32, ptr %P + + %round_4xf64 = call <4 x double> @llvm.round(<4 x double> poison) + store volatile <4 x double> %round_4xf64, ptr %P + + ret void +} + + +define void @roundeven_poison(ptr %P) { +; CHECK-LABEL: @roundeven_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %roundeven_f32 = call float @llvm.roundeven(float poison) + store volatile float %roundeven_f32, ptr %P + + %roundeven_2xf32 = call <2 x float> @llvm.roundeven(<2 x float> poison) + store volatile <2 x float> %roundeven_2xf32, ptr %P + + %roundeven_4xf64 = call <4 x double> @llvm.roundeven(<4 x double> poison) + store volatile <4 x double> %roundeven_4xf64, ptr %P + + ret void +} + + +define void @lrint_poison(ptr %P) { +; CHECK-LABEL: @lrint_poison( +; CHECK-NEXT: store volatile i32 poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x i32> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x i64> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %lrint_f32 = call i32 @llvm.lrint(float poison) + store volatile i32 %lrint_f32, ptr %P + + %lrint_2xf32 = call <2 x i32> @llvm.lrint(<2 x float> poison) + store volatile <2 x i32> %lrint_2xf32, ptr %P + + %lrint_4xf64 = call <4 x i64> @llvm.lrint(<4 x double> poison) + store volatile <4 x i64> %lrint_4xf64, ptr %P + + ret void +} + + +define void @llrint_poison(ptr %P) { +; CHECK-LABEL: @llrint_poison( +; CHECK-NEXT: store volatile i32 poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x i32> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x i64> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %llrint_f32 = call i32 @llvm.llrint(float poison) + store volatile i32 %llrint_f32, ptr %P + + %llrint_2xf32 = call <2 x i32> @llvm.llrint(<2 x float> poison) + store volatile <2 x i32> %llrint_2xf32, ptr %P + + %llrint_4xf64 = call <4 x i64> @llvm.llrint(<4 x double> poison) + store volatile <4 x i64> %llrint_4xf64, ptr %P + + ret void +} + + +define void @umul_fix_poison(ptr %P) { +; CHECK-LABEL: @umul_fix_poison( +; CHECK-NEXT: store volatile i16 
poison, ptr [[P:%.*]], align 2 +; CHECK-NEXT: store volatile i32 poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <4 x i32> poison, ptr [[P]], align 16 +; CHECK-NEXT: ret void +; + %umul_fix_i16 = call i16 @llvm.umul.fix(i16 poison, i16 poison, i32 2) + store volatile i16 %umul_fix_i16, ptr %P + + %umul_fix_i32 = call i32 @llvm.umul.fix(i32 poison, i32 poison, i32 2) + store volatile i32 %umul_fix_i32, ptr %P + + %umul_fix_4xi32 = call <4 x i32> @llvm.umul.fix(<4 x i32> poison, <4 x i32> poison, i32 2) + store volatile <4 x i32> %umul_fix_4xi32, ptr %P + + ret void +} + + +define void @umul_fix_sat_poison(ptr %P) { +; CHECK-LABEL: @umul_fix_sat_poison( +; CHECK-NEXT: store volatile i16 poison, ptr [[P:%.*]], align 2 +; CHECK-NEXT: store volatile i32 poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <4 x i32> poison, ptr [[P]], align 16 +; CHECK-NEXT: ret void +; + %umul_fix_sati16 = call i16 @llvm.umul.fix.sat(i16 poison, i16 poison, i32 2) + store volatile i16 %umul_fix_sati16, ptr %P + + %umul_fix_sati32 = call i32 @llvm.umul.fix.sat(i32 poison, i32 poison, i32 2) + store volatile i32 %umul_fix_sati32, ptr %P + + %umul_fix_sat4xi32 = call <4 x i32> @llvm.umul.fix.sat(<4 x i32> poison, <4 x i32> poison, i32 2) + store volatile <4 x i32> %umul_fix_sat4xi32, ptr %P + + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll index 173766cc..ccfa725 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll @@ -386,8 +386,7 @@ define void @single_fmul_used_by_each_member(ptr noalias %A, ptr noalias %B, ptr ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX24:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT25:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX24]] -; CHECK-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[TMP45]], i32 0 -; CHECK-NEXT: [[TMP47:%.*]] = load double, ptr [[TMP46]], align 8 +; CHECK-NEXT: [[TMP47:%.*]] = load double, ptr [[TMP45]], align 8 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP47]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP48:%.*]] = fmul <2 x double> [[BROADCAST_SPLAT]], splat (double 5.000000e+00) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll index 813d61b..38e224f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll @@ -166,8 +166,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor) { ; VF2: [[VECTOR_BODY]]: ; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] -; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 -; VF2-NEXT: [[TMP3:%.*]] = load i64, ptr 
[[TMP2]], align 8 +; VF2-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP1]], align 8 ; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0 ; VF2-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; VF2-NEXT: [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1 @@ -959,13 +958,11 @@ define void @test_2xi64_sub_of_wide_loads(ptr noalias %data, ptr noalias %A, ptr ; VF2: [[VECTOR_BODY]]: ; VF2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ] ; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] -; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 -; VF2-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0 ; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; VF2-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 ; VF2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0 ; VF2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer ; VF2-NEXT: [[TMP13:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll index f226ae9..cb7f0bf 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll @@ -18,8 +18,7 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0 diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll index bdd0c6f..7cc8458 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll @@ -431,195 +431,26 @@ exit: ret void } -define void @lifetime_for_ptr_first_arg_before_multiply(ptr noalias %A, ptr noalias %B, ptr noalias %C, i1 %c.0) { -; CHECK-LABEL: @lifetime_for_ptr_first_arg_before_multiply( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 
[[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] -; CHECK: then: -; CHECK-NEXT: br label [[EXIT]] -; CHECK: exit: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0 -; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2 -; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0 -; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2 -; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 -; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]] -; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]]) -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1> -; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]] -; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]]) -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: 
[[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]] -; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], <1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]]) -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1> -; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]] -; CHECK-NEXT: [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]]) -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[C1:%.*]], i64 0 -; CHECK-NEXT: store <2 x double> [[TMP13]], ptr [[TMP26]], align 8 -; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2 -; CHECK-NEXT: store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8 -; CHECK-NEXT: ret void -; -entry: - %a = load <4 x double>, ptr %A, align 8 - %b = load <4 x double>, ptr %B, align 8 - br i1 %c.0, label %then, label %exit - -then: - call void @llvm.lifetime.end(i64 -1, ptr %A) - br label %exit - -exit: - %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2) - store <4 x double> %m, ptr %C, align 8 - ret void -} - -define void 
@lifetime_for_both_ptr_args_before_multiply(ptr noalias %A, ptr noalias %B, ptr noalias %C, i1 %c.0) { -; CHECK-LABEL: @lifetime_for_both_ptr_args_before_multiply( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] -; CHECK: then: -; CHECK-NEXT: br label [[EXIT]] -; CHECK: exit: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0 -; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2 -; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0 -; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2 -; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 -; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]] -; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]]) -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1> -; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]] -; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]]) -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x 
double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]] -; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], <1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]]) -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1> -; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]] -; CHECK-NEXT: [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]]) -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[C1:%.*]], i64 0 -; CHECK-NEXT: store <2 x double> [[TMP13]], ptr [[TMP26]], align 8 -; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2 -; CHECK-NEXT: store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8 -; CHECK-NEXT: ret void -; -entry: - %a = load <4 x double>, ptr %A, align 8 - %b = load <4 x double>, ptr %B, align 8 - br i1 %c.0, label %then, label %exit - -then: - call void @llvm.lifetime.end(i64 -1, ptr %B) - call void 
@llvm.lifetime.end(i64 -1, ptr %A) - br label %exit - -exit: - %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2) - store <4 x double> %m, ptr %C, align 8 - ret void -} - -define void @multiple_unrelated_lifetimes(ptr noalias %A, ptr noalias %B, ptr noalias %C, i1 %c.0) { +define void @multiple_unrelated_lifetimes(ptr noalias %C, i1 %c.0) { ; CHECK-LABEL: @multiple_unrelated_lifetimes( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALLOC_1:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[ALLOC_2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca <4 x double>, align 32 +; CHECK-NEXT: [[B:%.*]] = alloca <4 x double>, align 32 +; CHECK-NEXT: call void @init(ptr [[A]]) +; CHECK-NEXT: call void @init(ptr [[B]]) ; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] ; CHECK: then: ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[ALLOC_1]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[ALLOC_2]]) ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 @@ -682,6 +513,10 @@ define void @multiple_unrelated_lifetimes(ptr noalias %A, ptr noalias %B, ptr no entry: %alloc.1 = alloca i32 %alloc.2 = alloca i32 + %A = alloca <4 x double> + %B = alloca <4 x double> + call void @init(ptr %A) + call void @init(ptr %B) %a = load <4 x double>, ptr %A, align 8 %b = load <4 x double>, ptr %B, align 8 br i1 %c.0, label %then, label %exit @@ -699,106 +534,20 @@ exit: ret void } -define void @lifetime_for_ptr_select_before_multiply(ptr noalias %A, ptr noalias %B, ptr noalias %C, i1 %c.0, i1 %c.1) { -; CHECK-LABEL: @lifetime_for_ptr_select_before_multiply( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P:%.*]] = select i1 [[C_0:%.*]], ptr [[A:%.*]], ptr [[B:%.*]] -; CHECK-NEXT: br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] -; CHECK: then: -; CHECK-NEXT: br label [[EXIT]] -; CHECK: exit: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[P]], i64 0 -; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2 -; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0 -; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2 -; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 -; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = 
shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]] -; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]]) -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1> -; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]] -; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]]) -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]] -; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], 
<1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]]) -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1> -; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]] -; CHECK-NEXT: [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]]) -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[C:%.*]], i64 0 -; CHECK-NEXT: store <2 x double> [[TMP13]], ptr [[TMP26]], align 8 -; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2 -; CHECK-NEXT: store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8 -; CHECK-NEXT: ret void -; -entry: - %P = select i1 %c.0, ptr %A, ptr %B - %a = load <4 x double>, ptr %P, align 8 - %b = load <4 x double>, ptr %B, align 8 - br i1 %c.1, label %then, label %exit - -then: - call void @llvm.lifetime.end(i64 -1, ptr %P) - br label %exit - -exit: - %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2) - store <4 x double> %m, ptr %C, align 8 - ret void -} - -define void @lifetimes_for_args_in_different_blocks(ptr noalias %B, ptr noalias %C, i1 %c.0) { +define void @lifetimes_for_args_in_different_blocks(ptr noalias %C, i1 %c.0) { ; CHECK-LABEL: @lifetimes_for_args_in_different_blocks( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A:%.*]] = alloca <4 x double>, align 32 +; CHECK-NEXT: [[B:%.*]] = alloca <4 x double>, align 32 ; CHECK-NEXT: call void @init(ptr [[A]]) +; CHECK-NEXT: call void @init(ptr [[B]]) ; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] ; CHECK: then: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 ; 
CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 @@ -864,7 +613,9 @@ define void @lifetimes_for_args_in_different_blocks(ptr noalias %B, ptr noalias ; entry: %A = alloca <4 x double> + %B = alloca <4 x double> call void @init(ptr %A) + call void @init(ptr %B) br i1 %c.0, label %then, label %exit then: @@ -880,15 +631,17 @@ exit: ret void } -define void @lifetimes_for_args_in_different_blocks2(ptr noalias %B, ptr noalias %C, i1 %c.0) { +define void @lifetimes_for_args_in_different_blocks2(ptr noalias %C, i1 %c.0) { ; CHECK-LABEL: @lifetimes_for_args_in_different_blocks2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A:%.*]] = alloca <4 x double>, align 32 +; CHECK-NEXT: [[B:%.*]] = alloca <4 x double>, align 32 ; CHECK-NEXT: call void @init(ptr [[A]]) +; CHECK-NEXT: call void @init(ptr [[B]]) ; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] ; CHECK: then: ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[A]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[B:%.*]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[B]]) ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0 @@ -957,7 +710,9 @@ define void @lifetimes_for_args_in_different_blocks2(ptr noalias %B, ptr noalias ; entry: %A = alloca <4 x double> + %B = alloca <4 x double> call void @init(ptr %A) + call void @init(ptr %B) br i1 %c.0, label %then, label %exit then: @@ -973,18 +728,20 @@ exit: ret void } -define void @lifetimes_for_args_load0_in_different_block(ptr noalias %B, ptr noalias %C, i1 %c.0) { +define void @lifetimes_for_args_load0_in_different_block(ptr noalias %C, i1 %c.0) { ; CHECK-LABEL: @lifetimes_for_args_load0_in_different_block( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A:%.*]] = alloca <4 x double>, align 32 +; CHECK-NEXT: [[B:%.*]] = alloca <4 x double>, align 32 ; CHECK-NEXT: call void @init(ptr [[A]]) +; CHECK-NEXT: call void @init(ptr [[B]]) ; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] ; CHECK: then: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 @@ -1048,7 +805,9 @@ define void @lifetimes_for_args_load0_in_different_block(ptr noalias %B, ptr noa ; entry: %A = alloca <4 x double> + %B = alloca <4 x double> call void @init(ptr %A) + call void @init(ptr %B) %a = load <4 x double>, ptr %A, align 8 call void @llvm.lifetime.end(i64 -1, ptr %A) br i1 %c.0, label %then, label %exit @@ -1064,18 +823,20 @@ exit: ret void } -define void @lifetimes_for_args_load1_in_different_block(ptr noalias %B, ptr noalias %C, i1 %c.0) { +define void @lifetimes_for_args_load1_in_different_block(ptr noalias %C, i1 %c.0) { ; CHECK-LABEL: @lifetimes_for_args_load1_in_different_block( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A:%.*]] = alloca <4 x double>, align 32 +; CHECK-NEXT: [[B:%.*]] = alloca <4 x double>, 
align 32 ; CHECK-NEXT: call void @init(ptr [[A]]) +; CHECK-NEXT: call void @init(ptr [[B]]) ; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] ; CHECK: then: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 @@ -1139,7 +900,9 @@ define void @lifetimes_for_args_load1_in_different_block(ptr noalias %B, ptr noa ; entry: %A = alloca <4 x double> + %B = alloca <4 x double> call void @init(ptr %A) + call void @init(ptr %B) %b = load <4 x double>, ptr %B, align 8 call void @llvm.lifetime.end(i64 -1, ptr %B) br i1 %c.0, label %then, label %exit diff --git a/llvm/test/Transforms/MemCpyOpt/lifetime.ll b/llvm/test/Transforms/MemCpyOpt/lifetime.ll index 6158874..e9fc06b 100644 --- a/llvm/test/Transforms/MemCpyOpt/lifetime.ll +++ b/llvm/test/Transforms/MemCpyOpt/lifetime.ll @@ -116,22 +116,3 @@ define i32 @call_slot_clobber_before_lifetime_start() { %v = load i32, ptr %dst ret i32 %v } - -define void @call_slot_lifetime_bitcast(ptr %ptr) { -; CHECK-LABEL: @call_slot_lifetime_bitcast( -; CHECK-NEXT: [[TMP1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP2]], ptr align 4 [[PTR:%.*]], i64 4, i1 false) -; CHECK-NEXT: [[TMP1_CAST:%.*]] = bitcast ptr [[TMP1]] to ptr -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[TMP1_CAST]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP1]], ptr align 4 [[PTR]], i64 4, i1 false) -; CHECK-NEXT: ret void -; - %tmp1 = alloca i32 - %tmp2 = alloca i32 - call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp2, ptr align 4 %ptr, i64 4, i1 false) - %tmp1.cast = bitcast ptr %tmp1 to ptr - call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %tmp1.cast) - call void @llvm.memcpy.p0.p0.i64(ptr align 4 %tmp1.cast, ptr align 4 %tmp2, i64 4, i1 false) - ret void -} diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll index 2f1ce37..816e103 100644 --- a/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll @@ -26,35 +26,41 @@ define i32 @test1(ptr nocapture %foobie) nounwind noinline ssp uwtable { } ; Check that the memcpy is removed. -define void @test2(ptr sret(i8) noalias nocapture %out, ptr %in) nounwind noinline ssp uwtable { +define void @test2(ptr sret(i8) noalias nocapture %out) nounwind noinline ssp uwtable { ; CHECK-LABEL: @test2( -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[IN:%.*]]) +; CHECK-NEXT: [[IN:%.*]] = alloca i64, align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[IN]]) ; CHECK-NEXT: ret void ; + %in = alloca i64 call void @llvm.lifetime.start.p0(i64 8, ptr %in) call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %in, i64 8, i1 false) ret void } ; Check that the memcpy is not removed. 
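; (A note on @test3 below, inferred from the sizes in the test itself: the memcpy reads 8 bytes but the lifetime.start only covers 4, so the source is not known to be entirely undef and the copy must stay.)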
-define void @test3(ptr sret(i8) noalias nocapture %out, ptr %in) nounwind noinline ssp uwtable { +define void @test3(ptr sret(i8) noalias nocapture %out) nounwind noinline ssp uwtable { ; CHECK-LABEL: @test3( -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[IN:%.*]]) +; CHECK-NEXT: [[IN:%.*]] = alloca i64, align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[IN]]) ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[OUT:%.*]], ptr [[IN]], i64 8, i1 false) ; CHECK-NEXT: ret void ; + %in = alloca i64 call void @llvm.lifetime.start.p0(i64 4, ptr %in) call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %in, i64 8, i1 false) ret void } ; Check that the memcpy is not removed. -define void @test_lifetime_may_alias(ptr %lifetime, ptr %src, ptr %dst) { +define void @test_lifetime_may_alias(ptr %src, ptr %dst) { ; CHECK-LABEL: @test_lifetime_may_alias( -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[LIFETIME:%.*]]) +; CHECK-NEXT: [[LIFETIME:%.*]] = alloca i64, align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[LIFETIME]]) ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 8, i1 false) ; CHECK-NEXT: ret void ; + %lifetime = alloca i64 call void @llvm.lifetime.start.p0(i64 8, ptr %lifetime) call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 8, i1 false) ret void diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll index 0c16f34..7ea63bb 100644 --- a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll +++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll @@ -37,29 +37,10 @@ define void @test_alloca_with_lifetimes(ptr %result) { ret void } -define void @test_malloc_with_lifetimes(ptr %result) { -; CHECK-LABEL: @test_malloc_with_lifetimes( -; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 16) -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[A]]) -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[A]], i8 0, i64 12, i1 false) -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[RESULT:%.*]], i8 0, i64 12, i1 false) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[A]]) -; CHECK-NEXT: call void @free(ptr [[A]]) -; CHECK-NEXT: ret void -; - %a = call ptr @malloc(i64 16) - call void @llvm.lifetime.start.p0(i64 16, ptr %a) - call void @llvm.memset.p0.i64(ptr align 8 %a, i8 0, i64 12, i1 false) - call void @llvm.memcpy.p0.p0.i64(ptr %result, ptr align 8 %a, i64 16, i1 false) - call void @llvm.lifetime.end.p0(i64 16, ptr %a) - call void @free(ptr %a) - ret void -} - ; memcpy size is larger than lifetime, don't optimize. 
define void @test_copy_larger_than_lifetime_size(ptr %result) { ; CHECK-LABEL: @test_copy_larger_than_lifetime_size( -; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 16) +; CHECK-NEXT: [[A:%.*]] = alloca [[T:%.*]], align 8 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[A]]) ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[A]], i8 0, i64 12, i1 false) ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[RESULT:%.*]], ptr align 8 [[A]], i64 16, i1 false) @@ -67,7 +48,7 @@ define void @test_copy_larger_than_lifetime_size(ptr %result) { ; CHECK-NEXT: call void @free(ptr [[A]]) ; CHECK-NEXT: ret void ; - %a = call ptr @malloc(i64 16) + %a = alloca %T, align 8 call void @llvm.lifetime.start.p0(i64 12, ptr %a) call void @llvm.memset.p0.i64(ptr align 8 %a, i8 0, i64 12, i1 false) call void @llvm.memcpy.p0.p0.i64(ptr %result, ptr align 8 %a, i64 16, i1 false) diff --git a/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll b/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll index b654319..ff36bf0 100644 --- a/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll +++ b/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll @@ -94,21 +94,6 @@ entry: ret void } -define i8 @test6(ptr %ptr, ptr noalias %ptr.1) { -; CHECK-LABEL: @test6( -; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 24, ptr [[PTR:%.*]]) -; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[PTR]], align 8 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[PTR]], ptr [[PTR_1:%.*]], i64 24, i1 false) -; CHECK-NEXT: ret i8 [[TMP0]] -; -entry: - call void @llvm.lifetime.start.p0(i64 24, ptr %ptr) - %0 = load i8, ptr %ptr, align 8 - call void @llvm.memmove.p0.p0.i64(ptr %ptr, ptr %ptr.1, i64 24, i1 false) - ret i8 %0 -} - define void @test7(ptr %ptr) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/MoveAutoInit/clobber.ll b/llvm/test/Transforms/MoveAutoInit/clobber.ll index 09084b6..08ffb13 100644 --- a/llvm/test/Transforms/MoveAutoInit/clobber.ll +++ b/llvm/test/Transforms/MoveAutoInit/clobber.ll @@ -10,14 +10,14 @@ define i32 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2) #0 { ; CHECK-NEXT: [[TMP4:%.*]] = alloca [100 x i8], align 16 ; CHECK-NEXT: [[TMP5:%.*]] = alloca [2 x i8], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [100 x i8], ptr [[TMP4]], i64 0, i64 0 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 100, ptr nonnull [[TMP6]]) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 100, ptr nonnull [[TMP4]]) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8], ptr [[TMP5]], i64 0, i64 0 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[TMP7]]) #[[ATTR3]] +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[TMP5]]) #[[ATTR3]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8], ptr [[TMP5]], i64 0, i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP1:%.*]], 0 ; CHECK-NEXT: br i1 [[TMP9]], label [[TMP15:%.*]], label [[TMP10:%.*]] ; CHECK: 10: -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(100) [[TMP6]], i8 -86, i64 100, i1 false), !annotation !0 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(100) [[TMP6]], i8 -86, i64 100, i1 false), !annotation [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP0:%.*]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [100 x i8], ptr [[TMP4]], i64 0, i64 [[TMP11]] ; CHECK-NEXT: store i8 12, ptr [[TMP12]], align 1 @@ 
-28,8 +28,8 @@ define i32 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2) #0 { ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP2:%.*]], 0 ; CHECK-NEXT: br i1 [[TMP16]], label [[TMP22]], label [[TMP17:%.*]] ; CHECK: 17: -; CHECK-NEXT: store i8 -86, ptr [[TMP7]], align 1, !annotation !0 -; CHECK-NEXT: store i8 -86, ptr [[TMP8]], align 1, !annotation !0 +; CHECK-NEXT: store i8 -86, ptr [[TMP7]], align 1, !annotation [[META0]] +; CHECK-NEXT: store i8 -86, ptr [[TMP8]], align 1, !annotation [[META0]] ; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8], ptr [[TMP5]], i64 0, i64 [[TMP18]] ; CHECK-NEXT: store i8 12, ptr [[TMP19]], align 1 @@ -38,19 +38,19 @@ define i32 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2) #0 { ; CHECK-NEXT: br label [[TMP22]] ; CHECK: 22: ; CHECK-NEXT: [[TMP23:%.*]] = phi i32 [ [[TMP14]], [[TMP10]] ], [ [[TMP21]], [[TMP17]] ], [ 0, [[TMP15]] ] -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[TMP7]]) #[[ATTR3]] -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 100, ptr nonnull [[TMP6]]) #[[ATTR3]] +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[TMP5]]) #[[ATTR3]] +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 100, ptr nonnull [[TMP4]]) #[[ATTR3]] ; CHECK-NEXT: ret i32 [[TMP23]] ; %4 = alloca [100 x i8], align 16 %5 = alloca [2 x i8], align 1 %6 = getelementptr inbounds [100 x i8], ptr %4, i64 0, i64 0 - call void @llvm.lifetime.start.p0(i64 100, ptr nonnull %6) #3 + call void @llvm.lifetime.start.p0(i64 100, ptr nonnull %4) #3 ; This memset must move. call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(100) %6, i8 -86, i64 100, i1 false), !annotation !0 %7 = getelementptr inbounds [2 x i8], ptr %5, i64 0, i64 0 - call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %7) #3 + call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %5) #3 ; This store must move. 
store i8 -86, ptr %7, align 1, !annotation !0 %8 = getelementptr inbounds [2 x i8], ptr %5, i64 0, i64 1 @@ -81,8 +81,8 @@ define i32 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2) #0 { 22: %23 = phi i32 [ %14, %10 ], [ %21, %17 ], [ 0, %15 ] - call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %7) #3 - call void @llvm.lifetime.end.p0(i64 100, ptr nonnull %6) #3 + call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %5) #3 + call void @llvm.lifetime.end.p0(i64 100, ptr nonnull %4) #3 ret i32 %23 } diff --git a/llvm/test/Transforms/NewGVN/lifetime-simple.ll b/llvm/test/Transforms/NewGVN/lifetime-simple.ll index 55e4611..0a7bd33 100644 --- a/llvm/test/Transforms/NewGVN/lifetime-simple.ll +++ b/llvm/test/Transforms/NewGVN/lifetime-simple.ll @@ -4,10 +4,11 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i386-apple-darwin7" -define i8 @test(ptr %P) nounwind { +define i8 @test() nounwind { ; CHECK-LABEL: define i8 @test( -; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[P:%.*]] = alloca [32 x i8], align 1 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[P]]) ; CHECK-NEXT: store i8 1, ptr [[P]], align 1 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[P]]) @@ -15,6 +16,7 @@ define i8 @test(ptr %P) nounwind { ; CHECK-NEXT: ret i8 [[TMP0]] ; entry: + %P = alloca [32 x i8] call void @llvm.lifetime.start.p0(i64 32, ptr %P) %0 = load i8, ptr %P store i8 1, ptr %P diff --git a/llvm/test/Transforms/NewGVN/salvage-eliminate-instruction.ll b/llvm/test/Transforms/NewGVN/salvage-eliminate-instruction.ll new file mode 100644 index 0000000..d1da7ea --- /dev/null +++ b/llvm/test/Transforms/NewGVN/salvage-eliminate-instruction.ll @@ -0,0 +1,45 @@ +; RUN: opt -S -passes=newgvn %s | FileCheck %s + +; Check that eliminateInstruction() replaces the debug uses of the instructions +; marked for deletion with the dominating leader. 
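+; (%add1 and %add2 below are commutative adds of the same operands, so NewGVN places them in one congruence class with %add1 as the leader; the CHECK lines verify that the #dbg_value of the eliminated %add2 is rewritten to use %add1.)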
+ +define void @binop(i32 %x, i32 %y) !dbg !5 { +; CHECK: #dbg_value(i32 %add1, [[META9:![0-9]+]], !DIExpression(), [[META12:![0-9]+]]) +; CHECK-NEXT: #dbg_value(i32 %add1, [[META11:![0-9]+]], !DIExpression(), [[META13:![0-9]+]]) +; + %add1 = add i32 %x, %y, !dbg !12 + #dbg_value(i32 %add1, !9, !DIExpression(), !12) + %add2 = add i32 %y, %x, !dbg !13 + #dbg_value(i32 %add2, !11, !DIExpression(), !13) + call void @use(i32 %add1, i32 %add2), !dbg !14 + ret void, !dbg !15 +} + +declare void @use(i32, i32) + +!llvm.dbg.cu = !{!0} +!llvm.debugify = !{!2, !3} +!llvm.module.flags = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "/app/example.ll", directory: "/") +!2 = !{i32 4} +!3 = !{i32 2} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "binop", linkageName: "binop", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !{!9, !11} +!9 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 1, type: !10) +!10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned) +!11 = !DILocalVariable(name: "2", scope: !5, file: !1, line: 2, type: !10) +!12 = !DILocation(line: 1, column: 1, scope: !5) +!13 = !DILocation(line: 2, column: 1, scope: !5) +!14 = !DILocation(line: 3, column: 1, scope: !5) +!15 = !DILocation(line: 4, column: 1, scope: !5) +;. +; CHECK: [[META9]] = !DILocalVariable(name: "1", +; CHECK: [[META11]] = !DILocalVariable(name: "2", +; CHECK: [[META12]] = !DILocation(line: 1, +; CHECK: [[META13]] = !DILocation(line: 2, +;. diff --git a/llvm/test/Transforms/NewGVN/salvage-trivially-dead-inst.ll b/llvm/test/Transforms/NewGVN/salvage-trivially-dead-inst.ll new file mode 100644 index 0000000..cc69541 --- /dev/null +++ b/llvm/test/Transforms/NewGVN/salvage-trivially-dead-inst.ll @@ -0,0 +1,33 @@ +; RUN: opt -passes=newgvn -S %s | FileCheck %s + +; Check that assignDFSNumbers() in NewGVN salvages the debug values of the +; trivially dead instructions that are marked for deletion. + +; CHECK: #dbg_value(i8 %tmp, [[META11:![0-9]+]], !DIExpression(DW_OP_constu, 8, DW_OP_eq, DW_OP_stack_value), [[META26:![0-9]+]]) +; CHECK: [[META11]] = !DILocalVariable(name: "2" +; CHECK: [[META26]] = !DILocation(line: 2 + +define void @test13() !dbg !5 { +entry: + %tmp = load i8, ptr null, align 1 + %tmp2 = icmp eq i8 %tmp, 8, !dbg !13 + #dbg_value(i1 %tmp2, !11, !DIExpression(), !13) + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.debugify = !{!2, !3} +!llvm.module.flags = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "/app/example.ll", directory: "/") +!2 = !{i32 3} +!3 = !{i32 2} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "test13", linkageName: "test13", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !{!11} +!10 = !DIBasicType(name: "ty8", size: 8, encoding: DW_ATE_unsigned) +!11 = !DILocalVariable(name: "2", scope: !5, file: !1, line: 2, type: !10) +!13 = !DILocation(line: 2, column: 1, scope: !5)
\ No newline at end of file diff --git a/llvm/test/Transforms/ObjCARC/inlined-autorelease-return-value.ll b/llvm/test/Transforms/ObjCARC/inlined-autorelease-return-value.ll index 60180c4..180fd0a 100644 --- a/llvm/test/Transforms/ObjCARC/inlined-autorelease-return-value.ll +++ b/llvm/test/Transforms/ObjCARC/inlined-autorelease-return-value.ll @@ -80,12 +80,14 @@ entry: ; CHECK-LABEL: define ptr @elide_with_retainRV_splitByLifetime( ; CHECK-NEXT: entry: +; CHECK-NEXT: %x = alloca ptr ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr %x) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr %x) ; CHECK-NEXT: ret ptr %x -define ptr @elide_with_retainRV_splitByLifetime(ptr %x) nounwind { +define ptr @elide_with_retainRV_splitByLifetime() nounwind { entry: ; Cleanup should skip over lifetime intrinsics. + %x = alloca ptr call void @llvm.lifetime.start(i64 8, ptr %x) %b = call ptr @llvm.objc.autoreleaseReturnValue(ptr %x) nounwind call void @llvm.lifetime.end(i64 8, ptr %x) @@ -218,13 +220,15 @@ entry: ; CHECK-LABEL: define ptr @elide_with_claimRV_splitByLifetime( ; CHECK-NEXT: entry: +; CHECK-NEXT: %x = alloca ptr ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr %x) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr %x) ; CHECK-NEXT: tail call void @llvm.objc.release(ptr %x) ; CHECK-NEXT: ret ptr %x -define ptr @elide_with_claimRV_splitByLifetime(ptr %x) nounwind { +define ptr @elide_with_claimRV_splitByLifetime() nounwind { entry: ; Cleanup should skip over lifetime intrinsics. + %x = alloca ptr call void @llvm.lifetime.start(i64 8, ptr %x) %b = call ptr @llvm.objc.autoreleaseReturnValue(ptr %x) nounwind call void @llvm.lifetime.end(i64 8, ptr %x) diff --git a/llvm/test/Transforms/SCCP/uscmp.ll b/llvm/test/Transforms/SCCP/uscmp.ll new file mode 100644 index 0000000..d010c06 --- /dev/null +++ b/llvm/test/Transforms/SCCP/uscmp.ll @@ -0,0 +1,185 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=sccp -S < %s | FileCheck %s + +define i32 @scmp_to_sub(i32 range(i32 -1, 2) %a) { +; CHECK-LABEL: define i32 @scmp_to_sub( +; CHECK-SAME: i32 range(i32 -1, 2) [[A:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = sub nsw i32 [[A]], 0 +; CHECK-NEXT: ret i32 [[SCMP]] +; + %scmp = call i32 @llvm.scmp(i32 %a, i32 0) + ret i32 %scmp +} + +define i32 @scmp_zext_to_sub(i1 %a, i1 %b) { +; CHECK-LABEL: define i32 @scmp_zext_to_sub( +; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]]) { +; CHECK-NEXT: [[ZEXT_A:%.*]] = zext i1 [[A]] to i32 +; CHECK-NEXT: [[ZEXT_B:%.*]] = zext i1 [[B]] to i32 +; CHECK-NEXT: [[SCMP:%.*]] = sub nsw i32 [[ZEXT_A]], [[ZEXT_B]] +; CHECK-NEXT: ret i32 [[SCMP]] +; + %zext_a = zext i1 %a to i32 + %zext_b = zext i1 %b to i32 + %scmp = call i32 @llvm.scmp(i32 %zext_a, i32 %zext_b) + ret i32 %scmp +} + +define i8 @scmp_to_sub_trunc(i32 range(i32 -1, 2) %a) { +; CHECK-LABEL: define i8 @scmp_to_sub_trunc( +; CHECK-SAME: i32 range(i32 -1, 2) [[A:%.*]]) { +; CHECK-NEXT: [[SCMP1:%.*]] = sub nsw i32 [[A]], 0 +; CHECK-NEXT: [[SCMP:%.*]] = trunc i32 [[SCMP1]] to i8 +; CHECK-NEXT: ret i8 [[SCMP]] +; + %scmp = call i8 @llvm.scmp(i32 %a, i32 0) + ret i8 %scmp +} + +define i64 @scmp_to_sub_sext(i32 range(i32 -1, 2) %a) { +; CHECK-LABEL: define i64 @scmp_to_sub_sext( +; CHECK-SAME: i32 range(i32 -1, 2) [[A:%.*]]) { +; CHECK-NEXT: [[SCMP1:%.*]] = sub nsw i32 [[A]], 0 +; CHECK-NEXT: [[SCMP:%.*]] = sext i32 [[SCMP1]] to i64 +; CHECK-NEXT: ret i64 [[SCMP]] +; + %scmp = call i64 @llvm.scmp(i32 %a, i32 0) + ret i64 %scmp +} + +define i32 
@scmp_to_sub_small_range(i32 range(i32 -1, 1) %a) { +; CHECK-LABEL: define i32 @scmp_to_sub_small_range( +; CHECK-SAME: i32 range(i32 -1, 1) [[A:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = sub nsw i32 [[A]], 0 +; CHECK-NEXT: ret i32 [[SCMP]] +; + %scmp = call i32 @llvm.scmp(i32 %a, i32 0) + ret i32 %scmp +} + +define i32 @ucmp_to_sub(i32 range(i32 0, 3) %a) { +; CHECK-LABEL: define i32 @ucmp_to_sub( +; CHECK-SAME: i32 range(i32 0, 3) [[A:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = sub i32 [[A]], 1 +; CHECK-NEXT: ret i32 [[SCMP]] +; + %ucmp = call i32 @llvm.ucmp(i32 %a, i32 1) + ret i32 %ucmp +} + +define i8 @ucmp_to_sub_trunc(i32 range(i32 0, 3) %a) { +; CHECK-LABEL: define i8 @ucmp_to_sub_trunc( +; CHECK-SAME: i32 range(i32 0, 3) [[A:%.*]]) { +; CHECK-NEXT: [[UCMP1:%.*]] = sub i32 [[A]], 1 +; CHECK-NEXT: [[UCMP:%.*]] = trunc i32 [[UCMP1]] to i8 +; CHECK-NEXT: ret i8 [[UCMP]] +; + %ucmp = call i8 @llvm.ucmp(i32 %a, i32 1) + ret i8 %ucmp +} + +define i64 @ucmp_to_sub_sext(i32 range(i32 0, 3) %a) { +; CHECK-LABEL: define i64 @ucmp_to_sub_sext( +; CHECK-SAME: i32 range(i32 0, 3) [[A:%.*]]) { +; CHECK-NEXT: [[UCMP1:%.*]] = sub i32 [[A]], 1 +; CHECK-NEXT: [[UCMP:%.*]] = sext i32 [[UCMP1]] to i64 +; CHECK-NEXT: ret i64 [[UCMP]] +; + %ucmp = call i64 @llvm.ucmp(i32 %a, i32 1) + ret i64 %ucmp +} + +; TODO: we can fold this into %a. +define i32 @ucmp_to_sub_small_range(i32 range(i32 0, 2) %a) { +; CHECK-LABEL: define i32 @ucmp_to_sub_small_range( +; CHECK-SAME: i32 range(i32 0, 2) [[A:%.*]]) { +; CHECK-NEXT: [[UCMP:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[A]], i32 0) +; CHECK-NEXT: ret i32 [[UCMP]] +; + %ucmp = call i32 @llvm.ucmp(i32 %a, i32 0) + ret i32 %ucmp +} + +define i32 @scmp_to_sub_large_range(i32 range(i32 -1, 3) %a) { +; CHECK-LABEL: define i32 @scmp_to_sub_large_range( +; CHECK-SAME: i32 range(i32 -1, 3) [[A:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[A]], i32 0) +; CHECK-NEXT: ret i32 [[SCMP]] +; + %scmp = call i32 @llvm.scmp(i32 %a, i32 0) + ret i32 %scmp +} + +define i32 @ucmp_to_sub_large_range(i32 range(i32 -1, 3) %a) { +; CHECK-LABEL: define i32 @ucmp_to_sub_large_range( +; CHECK-SAME: i32 range(i32 -1, 3) [[A:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[A]], i32 0) +; CHECK-NEXT: ret i32 [[SCMP]] +; + %ucmp = call i32 @llvm.ucmp(i32 %a, i32 0) + ret i32 %ucmp +} + +define i32 @scmp_to_sub_wrap(i8 range(i8 127, -126) %a) { +; CHECK-LABEL: define i32 @scmp_to_sub_wrap( +; CHECK-SAME: i8 range(i8 127, -126) [[A:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.scmp.i32.i8(i8 [[A]], i8 -128) +; CHECK-NEXT: ret i32 [[SCMP]] +; + %scmp = call i32 @llvm.scmp(i8 %a, i8 -128) + ret i32 %scmp +} + +define i32 @ucmp_to_sub_wrap(i8 range(i8 -1, 2) %a) { +; CHECK-LABEL: define i32 @ucmp_to_sub_wrap( +; CHECK-SAME: i8 range(i8 -1, 2) [[A:%.*]]) { +; CHECK-NEXT: [[UCMP:%.*]] = call i32 @llvm.ucmp.i32.i8(i8 [[A]], i8 0) +; CHECK-NEXT: ret i32 [[UCMP]] +; + %ucmp = call i32 @llvm.ucmp(i8 %a, i8 0) + ret i32 %ucmp +} + +; It is incorrect to convert a ucmp into sub when the input type is i1. +define i32 @ucmp_to_sub_i1_rhs_const(i1 %a) { +; CHECK-LABEL: define i32 @ucmp_to_sub_i1_rhs_const( +; CHECK-SAME: i1 [[A:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.ucmp.i32.i1(i1 [[A]], i1 false) +; CHECK-NEXT: ret i32 [[SCMP]] +; + %ucmp = call i32 @llvm.ucmp(i1 %a, i1 false) + ret i32 %ucmp +} + +; It is incorrect to convert a ucmp into sub when the input type is i1. 
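+; (With i1 operands the difference wraps: ucmp(i1 true, i1 false) is 1, but sub i1 true, false is true, which the widening sext used elsewhere in this transform would turn into -1.)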
+define i32 @ucmp_to_sub_i1_lhs_const(i1 %a) { +; CHECK-LABEL: define i32 @ucmp_to_sub_i1_lhs_const( +; CHECK-SAME: i1 [[A:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.ucmp.i32.i1(i1 false, i1 [[A]]) +; CHECK-NEXT: ret i32 [[SCMP]] +; + %ucmp = call i32 @llvm.ucmp(i1 false, i1 %a) + ret i32 %ucmp +} + +; It is incorrect to convert a ucmp into sub when the input type is i1. +define i32 @ucmp_to_sub_i1(i1 %a, i1 %b) { +; CHECK-LABEL: define i32 @ucmp_to_sub_i1( +; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.ucmp.i32.i1(i1 [[A]], i1 [[B]]) +; CHECK-NEXT: ret i32 [[SCMP]] +; + %ucmp = call i32 @llvm.ucmp(i1 %a, i1 %b) + ret i32 %ucmp +} + +; It is incorrect to convert a scmp into sub when the input type is i1. +define i32 @scmp_to_sub_i1_rhs_const(i1 %a) { +; CHECK-LABEL: define i32 @scmp_to_sub_i1_rhs_const( +; CHECK-SAME: i1 [[A:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.scmp.i32.i1(i1 [[A]], i1 false) +; CHECK-NEXT: ret i32 [[SCMP]] +; + %scmp = call i32 @llvm.scmp(i1 %a, i1 false) + ret i32 %scmp +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll index 07fdc9d..7408ba1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll @@ -4,9 +4,6 @@ define void @test() { ; CHECK-LABEL: define void @test() { ; CHECK-NEXT: [[BB:.*:]] -; CHECK-NEXT: [[ADD:%.*]] = add i32 1, 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[ADD]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], zeroinitializer ; CHECK-NEXT: [[ICMP:%.*]] = icmp samesign ult i32 0, 0 ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[ICMP]], i32 0, i32 0 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SELECT]] to i64 @@ -17,8 +14,7 @@ define void @test() { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[CALL]], i32 3 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 undef, i1 undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll index 15ba98f..5e3d471 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll @@ -7,17 +7,10 @@ define i32 @test() { ; CHECK-NEXT: br label %[[FUNC_135_EXIT_I:.*]] ; CHECK: [[FUNC_135_EXIT_I]]: ; CHECK-NEXT: [[G_228_PROMOTED166_I1105_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 poison, i32 poison>, i32 
[[G_228_PROMOTED166_I1105_I]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1> -; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> poison, i32 [[G_228_PROMOTED166_I1105_I]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP9]], <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 23, i32 8, i32 9, i32 10, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <12 x i32> [[TMP3]], <12 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP17]], <16 x i32> [[TMP8]], <16 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 23, i32 24, i32 25, i32 26, i32 2, i32 2, i32 2, i32 2, i32 3> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison>, i32 [[G_228_PROMOTED166_I1105_I]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison>, [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4> ; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = icmp ult <16 x i32> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP12]], <16 x i1> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll b/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll index 1c482e0..03d76ef 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll @@ -4,11 +4,10 @@ define i64 @test() { ; CHECK-LABEL: define i64 @test() { ; CHECK-NEXT: [[BB:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = 
insertelement <2 x i32> <i32 0, i32 poison>, i32 0, i32 1 ; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB5:.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> zeroinitializer, [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3> ; CHECK-NEXT: [[TMP4]] = or <2 x i32> [[TMP3]], zeroinitializer ; CHECK-NEXT: br label %[[BB5]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll index 652abef..6bb52e0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll @@ -7,19 +7,17 @@ define void @test() { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK: [[IF_THEN_I_I:.*]]: -; CHECK-NEXT: br label %[[BB5:.*]] +; CHECK-NEXT: br label %[[BB3:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: [[TMP0:%.*]] = zext i1 false to i64 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i64> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> <i64 0, i64 0, i64 poison, i64 poison>, <4 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5> -; CHECK-NEXT: br i1 false, label %[[BB5]], label %[[BB2:.*]] -; CHECK: [[BB5]]: -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x i64> [ [[TMP4]], %[[BB1]] ], [ poison, %[[IF_THEN_I_I]] ] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> <i64 0, i64 0, i64 poison, i64 0>, i64 [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> zeroinitializer, [[TMP1]] +; CHECK-NEXT: br i1 false, label %[[BB3]], label %[[BB2:.*]] +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i64> [ [[TMP2]], %[[BB1]] ], [ poison, %[[IF_THEN_I_I]] ] ; CHECK-NEXT: br label %[[BB2]] ; CHECK: [[BB2]]: -; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP6]], %[[BB5]] ], [ [[TMP4]], %[[BB1]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP4]], %[[BB3]] ], [ [[TMP2]], %[[BB1]] ] ; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr getelementptr inbounds nuw (i8, ptr null, i64 40), align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll index a4949bc..782aada 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll @@ -6,14 +6,9 @@ target triple = "x86_64-unknown-linux-gnu" define <4 x i32> @foo(<4 x i32> %x, i32 %f) { ; CHECK-LABEL: @foo( -; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[F:%.*]], i32 0 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[F]], 1 -; CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 2, i32 3> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 
1, i32 poison, i32 poison> -; CHECK-NEXT: [[VECINIT51:%.*]] = shufflevector <4 x i32> [[VECINIT1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5> +; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[F:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECINIT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VECINIT51:%.*]] = add <4 x i32> [[TMP2]], <i32 0, i32 1, i32 2, i32 3> ; CHECK-NEXT: ret <4 x i32> [[VECINIT51]] ; %vecinit = insertelement <4 x i32> undef, i32 %f, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll index ad4daea..125c2dc 100644 --- a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll +++ b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll @@ -150,9 +150,9 @@ define <2 x i32> @replace_through_int_casts_ele0_only(i16 %inp, <2 x i16> %dead) define <2 x i8> @replace_through_binop_fail_cant_speculate(i8 %inp, <2 x i8> %d, <2 x i8> %any) { ; CHECK-LABEL: define <2 x i8> @replace_through_binop_fail_cant_speculate( ; CHECK-SAME: i8 [[INP:%.*]], <2 x i8> [[D:%.*]], <2 x i8> [[ANY:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[INP]], 5 -; CHECK-NEXT: [[V0:%.*]] = insertelement <2 x i8> poison, i8 [[INP]], i64 0 -; CHECK-NEXT: [[V:%.*]] = insertelement <2 x i8> [[V0]], i8 [[ADD]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i8> poison, i8 [[INP]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[V:%.*]] = add <2 x i8> [[TMP2]], <i8 0, i8 5> ; CHECK-NEXT: [[DIV0:%.*]] = sdiv <2 x i8> splat (i8 -128), [[V]] ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[INP]], 123 ; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i8> [[DIV0]], i8 [[TMP1]], i64 0 diff --git a/llvm/test/Transforms/SafeStack/X86/coloring2.ll b/llvm/test/Transforms/SafeStack/X86/coloring2.ll index 2e02ea6..ae5f375 100644 --- a/llvm/test/Transforms/SafeStack/X86/coloring2.ll +++ b/llvm/test/Transforms/SafeStack/X86/coloring2.ll @@ -478,43 +478,6 @@ l2: br label %l2 } -; This test checks for a bug where the stack coloring algorithm was not tracking -; the live range of allocas through phi instructions, so it did not consider -; alloca and alloca2 to be live at the same time. As a result it was using -; the same stack slot for both allocas. To ensure this bug isn't present, we -; check that there are 64 bytes allocated for the unsafe stack which is enough -; space for both allocas. 
-; CHECK-LABEL: @stack_coloring_liveness_bug -define void @stack_coloring_liveness_bug(i32 %arg0) #0 { -entry: -; CHECK: %[[USP:.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr -; CHECK-NEXT: getelementptr i8, ptr %[[USP]], i32 -64 - %alloca = alloca [32 x i8], align 16 - %alloca2 = alloca [32 x i8], align 16 - %cond = icmp eq i32 %arg0, 0 - br i1 %cond, label %if, label %else - -if: - br label %end - -else: -; CHECK: getelementptr i8, ptr %[[USP]], i32 -32 - call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %alloca) - call void @capture8(ptr %alloca) - call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %alloca) - br label %end - -end: -; CHECK: getelementptr i8, ptr %[[USP]], i32 -64 - %alloca.end = phi ptr [ %alloca, %if], [%alloca, %else] - call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %alloca2) - call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %alloca.end) - call void @capture2_8(ptr %alloca2, ptr %alloca.end) - call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %alloca2) - call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %alloca.end) - ret void -} - attributes #0 = { safestack } declare void @llvm.lifetime.start.p0(i64, ptr nocapture) diff --git a/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll b/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll new file mode 100644 index 0000000..220556c --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll @@ -0,0 +1,262 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- | FileCheck %s + +; Negative test: bitcast from float to int (optimization should not apply) +define <4 x i32> @and_bitcast_v4f32_to_v4i32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: @and_bitcast_v4f32_to_v4i32( +; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +; CHECK-NEXT: [[BC2:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32> +; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[BC1]], [[BC2]] +; CHECK-NEXT: ret <4 x i32> [[AND]] +; + %bc1 = bitcast <4 x float> %a to <4 x i32> + %bc2 = bitcast <4 x float> %b to <4 x i32> + %and = and <4 x i32> %bc1, %bc2 + ret <4 x i32> %and +} + +; Test bitwise operations with integer-to-integer bitcast +define <2 x i32> @or_bitcast_v4i16_to_v2i32(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: @or_bitcast_v4i16_to_v2i32( +; CHECK-NEXT: [[B:%.*]] = or <4 x i16> [[A:%.*]], [[B1:%.*]] +; CHECK-NEXT: [[BC2:%.*]] = bitcast <4 x i16> [[B]] to <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[BC2]] +; + %bc1 = bitcast <4 x i16> %a to <2 x i32> + %bc2 = bitcast <4 x i16> %b to <2 x i32> + %or = or <2 x i32> %bc1, %bc2 + ret <2 x i32> %or +} + +define <16 x i8> @xor_bitcast_v2i64_to_v16i8(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: @xor_bitcast_v2i64_to_v16i8( +; CHECK-NEXT: [[B:%.*]] = xor <2 x i64> [[A:%.*]], [[B1:%.*]] +; CHECK-NEXT: [[BC2:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +; CHECK-NEXT: ret <16 x i8> [[BC2]] +; + %bc1 = bitcast <2 x i64> %a to <16 x i8> + %bc2 = bitcast <2 x i64> %b to <16 x i8> + %xor = xor <16 x i8> %bc1, %bc2 + ret <16 x i8> %xor +} + +; Test bitwise operations with truncate +define <4 x i16> @and_trunc_v4i32_to_v4i16(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: @and_trunc_v4i32_to_v4i16( +; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[AND:%.*]] = trunc <4 x i32> [[AND_INNER]] to <4 x i16> +; CHECK-NEXT: ret <4 x i16> [[AND]] +; + %t1 = trunc <4 x i32> %a to <4 x i16> + %t2 = trunc <4 x i32> %b to <4 x i16> + 
%and = and <4 x i16> %t1, %t2 + ret <4 x i16> %and +} + +define <8 x i8> @or_trunc_v8i16_to_v8i8(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: @or_trunc_v8i16_to_v8i8( +; CHECK-NEXT: [[OR_INNER:%.*]] = or <8 x i16> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = trunc <8 x i16> [[OR_INNER]] to <8 x i8> +; CHECK-NEXT: ret <8 x i8> [[OR]] +; + %t1 = trunc <8 x i16> %a to <8 x i8> + %t2 = trunc <8 x i16> %b to <8 x i8> + %or = or <8 x i8> %t1, %t2 + ret <8 x i8> %or +} + +define <2 x i32> @xor_trunc_v2i64_to_v2i32(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: @xor_trunc_v2i64_to_v2i32( +; CHECK-NEXT: [[XOR_INNER:%.*]] = xor <2 x i64> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[XOR:%.*]] = trunc <2 x i64> [[XOR_INNER]] to <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[XOR]] +; + %t1 = trunc <2 x i64> %a to <2 x i32> + %t2 = trunc <2 x i64> %b to <2 x i32> + %xor = xor <2 x i32> %t1, %t2 + ret <2 x i32> %xor +} + +; Test bitwise operations with zero extend +define <4 x i32> @and_zext_v4i16_to_v4i32(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: @and_zext_v4i16_to_v4i32( +; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i16> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[AND:%.*]] = zext <4 x i16> [[AND_INNER]] to <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[AND]] +; + %z1 = zext <4 x i16> %a to <4 x i32> + %z2 = zext <4 x i16> %b to <4 x i32> + %and = and <4 x i32> %z1, %z2 + ret <4 x i32> %and +} + +define <8 x i16> @or_zext_v8i8_to_v8i16(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @or_zext_v8i8_to_v8i16( +; CHECK-NEXT: [[OR_INNER:%.*]] = or <8 x i8> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = zext <8 x i8> [[OR_INNER]] to <8 x i16> +; CHECK-NEXT: ret <8 x i16> [[OR]] +; + %z1 = zext <8 x i8> %a to <8 x i16> + %z2 = zext <8 x i8> %b to <8 x i16> + %or = or <8 x i16> %z1, %z2 + ret <8 x i16> %or +} + +define <2 x i64> @xor_zext_v2i32_to_v2i64(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: @xor_zext_v2i32_to_v2i64( +; CHECK-NEXT: [[XOR_INNER:%.*]] = xor <2 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[XOR:%.*]] = zext <2 x i32> [[XOR_INNER]] to <2 x i64> +; CHECK-NEXT: ret <2 x i64> [[XOR]] +; + %z1 = zext <2 x i32> %a to <2 x i64> + %z2 = zext <2 x i32> %b to <2 x i64> + %xor = xor <2 x i64> %z1, %z2 + ret <2 x i64> %xor +} + +; Test bitwise operations with sign extend +define <4 x i32> @and_sext_v4i16_to_v4i32(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: @and_sext_v4i16_to_v4i32( +; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i16> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[AND:%.*]] = sext <4 x i16> [[AND_INNER]] to <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[AND]] +; + %s1 = sext <4 x i16> %a to <4 x i32> + %s2 = sext <4 x i16> %b to <4 x i32> + %and = and <4 x i32> %s1, %s2 + ret <4 x i32> %and +} + +define <8 x i16> @or_sext_v8i8_to_v8i16(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @or_sext_v8i8_to_v8i16( +; CHECK-NEXT: [[OR_INNER:%.*]] = or <8 x i8> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = sext <8 x i8> [[OR_INNER]] to <8 x i16> +; CHECK-NEXT: ret <8 x i16> [[OR]] +; + %s1 = sext <8 x i8> %a to <8 x i16> + %s2 = sext <8 x i8> %b to <8 x i16> + %or = or <8 x i16> %s1, %s2 + ret <8 x i16> %or +} + +define <2 x i64> @xor_sext_v2i32_to_v2i64(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: @xor_sext_v2i32_to_v2i64( +; CHECK-NEXT: [[XOR_INNER:%.*]] = xor <2 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[XOR:%.*]] = sext <2 x i32> [[XOR_INNER]] to <2 x i64> +; CHECK-NEXT: ret <2 x i64> [[XOR]] +; + %s1 = sext <2 x i32> %a to <2 x i64> + %s2 = sext <2 x i32> %b to <2 x i64> + %xor = xor <2 x i64> %s1, %s2 + ret <2 x i64> %xor +} + 
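+; (The positive tests above all exercise the same fold: when both operands of a bitwise op are the same kind of cast from the same source type, the op is performed in the source type and a single cast is kept, e.g. or (zext A), (zext B) -> zext (or A, B).)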
+; Negative test: mismatched cast types (zext and sext)
+define <4 x i32> @and_zext_sext_mismatch(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: @and_zext_sext_mismatch(
+; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT: [[S2:%.*]] = sext <4 x i16> [[B:%.*]] to <4 x i32>
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[Z1]], [[S2]]
+; CHECK-NEXT: ret <4 x i32> [[AND]]
+;
+ %z1 = zext <4 x i16> %a to <4 x i32>
+ %s2 = sext <4 x i16> %b to <4 x i32>
+ %and = and <4 x i32> %z1, %s2
+ ret <4 x i32> %and
+}
+
+; Negative test: mismatched source types
+define <4 x i32> @or_zext_different_src_types(<4 x i16> %a, <4 x i8> %b) {
+; CHECK-LABEL: @or_zext_different_src_types(
+; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT: [[Z2:%.*]] = zext <4 x i8> [[B:%.*]] to <4 x i32>
+; CHECK-NEXT: [[OR:%.*]] = or <4 x i32> [[Z1]], [[Z2]]
+; CHECK-NEXT: ret <4 x i32> [[OR]]
+;
+ %z1 = zext <4 x i16> %a to <4 x i32>
+ %z2 = zext <4 x i8> %b to <4 x i32>
+ %or = or <4 x i32> %z1, %z2
+ ret <4 x i32> %or
+}
+
+; Negative test: scalar types (not vectors)
+define i32 @xor_zext_scalar(i16 %a, i16 %b) {
+; CHECK-LABEL: @xor_zext_scalar(
+; CHECK-NEXT: [[Z1:%.*]] = zext i16 [[A:%.*]] to i32
+; CHECK-NEXT: [[Z2:%.*]] = zext i16 [[B:%.*]] to i32
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Z1]], [[Z2]]
+; CHECK-NEXT: ret i32 [[XOR]]
+;
+ %z1 = zext i16 %a to i32
+ %z2 = zext i16 %b to i32
+ %xor = xor i32 %z1, %z2
+ ret i32 %xor
+}
+
+; Test multi-use: one cast has multiple uses
+define <4 x i32> @and_zext_multiuse(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: @and_zext_multiuse(
+; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i16> [[A]], [[B:%.*]]
+; CHECK-NEXT: [[AND:%.*]] = zext <4 x i16> [[AND_INNER]] to <4 x i32>
+; CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[Z1]], [[AND]]
+; CHECK-NEXT: ret <4 x i32> [[ADD]]
+;
+ %z1 = zext <4 x i16> %a to <4 x i32>
+ %z2 = zext <4 x i16> %b to <4 x i32>
+ %and = and <4 x i32> %z1, %z2
+ %add = add <4 x i32> %z1, %and ; z1 has multiple uses
+ ret <4 x i32> %add
+}
+
+; Test with different vector sizes
+define <16 x i16> @or_zext_v16i8_to_v16i16(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @or_zext_v16i8_to_v16i16(
+; CHECK-NEXT: [[OR_INNER:%.*]] = or <16 x i8> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = zext <16 x i8> [[OR_INNER]] to <16 x i16>
+; CHECK-NEXT: ret <16 x i16> [[OR]]
+;
+ %z1 = zext <16 x i8> %a to <16 x i16>
+ %z2 = zext <16 x i8> %b to <16 x i16>
+ %or = or <16 x i16> %z1, %z2
+ ret <16 x i16> %or
+}
+
+; Test bitcast with different element counts
+define <8 x i16> @xor_bitcast_v4i32_to_v8i16(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @xor_bitcast_v4i32_to_v8i16(
+; CHECK-NEXT: [[XOR_INNER:%.*]] = xor <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[XOR:%.*]] = bitcast <4 x i32> [[XOR_INNER]] to <8 x i16>
+; CHECK-NEXT: ret <8 x i16> [[XOR]]
+;
+ %bc1 = bitcast <4 x i32> %a to <8 x i16>
+ %bc2 = bitcast <4 x i32> %b to <8 x i16>
+ %xor = xor <8 x i16> %bc1, %bc2
+ ret <8 x i16> %xor
+}
+
+; Test truncate with flag preservation
+define <4 x i16> @and_trunc_nuw_nsw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @and_trunc_nuw_nsw(
+; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[AND:%.*]] = trunc nuw nsw <4 x i32> [[AND_INNER]] to <4 x i16>
+; CHECK-NEXT: ret <4 x i16> [[AND]]
+;
+ %t1 = trunc nuw nsw <4 x i32> %a to <4 x i16>
+ %t2 = trunc nuw nsw <4 x i32> %b to <4 x i16>
+ %and = and <4 x i16> %t1, %t2
+ ret <4 x i16> %and
+}
+
+; Test zero extend with nneg flag
+define <4 x i32> @or_zext_nneg(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: @or_zext_nneg(
+; CHECK-NEXT: [[OR_INNER:%.*]] = or <4 x i16> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = zext nneg <4 x i16> [[OR_INNER]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[OR]]
+;
+ %z1 = zext nneg <4 x i16> %a to <4 x i32>
+ %z2 = zext nneg <4 x i16> %b to <4 x i32>
+ %or = or <4 x i32> %z1, %z2
+ ret <4 x i32> %or
+}
diff --git a/llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll b/llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll
new file mode 100644
index 0000000..af0d7f1
--- /dev/null
+++ b/llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll
@@ -0,0 +1,165 @@
+; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
+
+; --------------------------------------------------------------------
+; Wrong mangled types
+; --------------------------------------------------------------------
+
+; CHECK: operand 1 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <16 x i64> %A
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i64_fp8___v16i32_fp8(<16 x i64> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: operand 3 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <16 x i64> %B
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i64_fp8(<16 x i32> %A, <16 x i64> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; --------------------------------------------------------------------
+; Impossible vector types
+; --------------------------------------------------------------------
+
+; CHECK: operand 1 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v9i32.v16i32(i32 0, <9 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <9 x i32> %A
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v9i32_fp8___v16i32_fp8(<9 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v9i32.v16i32(i32 0, <9 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: operand 3 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v9i32(i32 0, <16 x i32> %A, i32 0, <9 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <9 x i32> %B
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v9i32_fp8(<16 x i32> %A, <9 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v9i32(i32 0, <16 x i32> %A, i32 0, <9 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; --------------------------------------------------------------------
+; Out of bounds format
+; --------------------------------------------------------------------
+
+; CHECK: invalid value for matrix format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 9999, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: i32 9999
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_invalid0___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 9999, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid value for matrix format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 9999, <16 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: i32 9999
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_invalid1(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 9999, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; --------------------------------------------------------------------
+; Incorrect signature for format cases (IR vector too small)
+; --------------------------------------------------------------------
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 0, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <8 x i32> %A
+; CHECK-NEXT: i32 0
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v8i32_fp8___v16i32_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 0, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 0, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <12 x i32> %A
+; CHECK-NEXT: i32 0
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp8___v16i32_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 0, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 1, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <8 x i32> %A
+; CHECK-NEXT: i32 1
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v8i32_bf8___v16i32_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 1, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 1, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <12 x i32> %A
+; CHECK-NEXT: i32 1
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_bf8___v16i32_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 1, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 0, <8 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <8 x i32> %B
+; CHECK-NEXT: i32 0
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v8i32_fp8(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 0, <8 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 0, <12 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <12 x i32> %B
+; CHECK-NEXT: i32 0
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp8(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 0, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 1, <8 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <8 x i32> %B
+; CHECK-NEXT: i32 1
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v8i32_bf8(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 1, <8 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 1, <12 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <12 x i32> %B
+; CHECK-NEXT: i32 1
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_bf8(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 1, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/Verifier/amdgpu-cc.ll b/llvm/test/Verifier/amdgpu-cc.ll
index aec0977..e86825e 100644
--- a/llvm/test/Verifier/amdgpu-cc.ll
+++ b/llvm/test/Verifier/amdgpu-cc.ll
@@ -217,3 +217,36 @@ define amdgpu_cs_chain_preserve void @preallocated_cc_amdgpu_cs_chain_preserve(p
 define amdgpu_cs_chain_preserve void @inalloca_cc_amdgpu_cs_chain_preserve(ptr inalloca(i32) %ptr) {
 ret void
 }
+
+; CHECK: Calling convention requires first argument to be i1
+; CHECK-NEXT: ptr @whole_wave_no_args
+define amdgpu_gfx_whole_wave void @whole_wave_no_args() {
+ ret void
+}
+
+; CHECK: Calling convention requires first argument to be i1
+; CHECK-NEXT: ptr @whole_wave_must_have_i1_active
+define amdgpu_gfx_whole_wave void @whole_wave_must_have_i1_active(i32 %x) {
+ ret void
+}
+
+; CHECK: Calling convention requires first argument to not be inreg
+; CHECK-NEXT: ptr @whole_wave_i1_active_inreg
+define amdgpu_gfx_whole_wave void @whole_wave_i1_active_inreg(i1 inreg %active) {
+ ret void
+}
+
+; CHECK: Calling convention does not support varargs
+; CHECK-NEXT: ptr @whole_wave_varargs
+define amdgpu_gfx_whole_wave void @whole_wave_varargs(i1 %active, i32 %x, ...) {
+ ret void
+}
+
+declare amdgpu_gfx_whole_wave void @whole_wave_callee(i1 %active)
+
+; CHECK: calling convention does not permit calls
+; CHECK-NEXT: call amdgpu_gfx_whole_wave void @whole_wave_callee(i1 true)
+define amdgpu_cs void @cant_call_whole_wave_func() {
+ call amdgpu_gfx_whole_wave void @whole_wave_callee(i1 true)
+ ret void
+}
diff --git a/llvm/test/Verifier/intrinsic-immarg.ll b/llvm/test/Verifier/intrinsic-immarg.ll
index dd940d5..c1bb932 100644
--- a/llvm/test/Verifier/intrinsic-immarg.ll
+++ b/llvm/test/Verifier/intrinsic-immarg.ll
@@ -164,19 +164,21 @@ define void @test_scatter_8i32(<8 x i32> %a1, <8 x ptr> %ptr, <8 x i1> %mask, i3
 }
 
 declare void @llvm.lifetime.start.p0(i64, ptr)
-define void @test_lifetime_start(i64 %arg0, ptr %ptr) {
+define void @test_lifetime_start(i64 %arg0) {
 ; CHECK: immarg operand has non-immediate parameter
 ; CHECK-NEXT: i64 %arg0
 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 %arg0, ptr %ptr)
+ %ptr = alloca i64
 call void @llvm.lifetime.start.p0(i64 %arg0, ptr %ptr)
 ret void
 }
 
 declare void @llvm.lifetime.end.p0(i64, ptr)
-define void @test_lifetime_end(i64 %arg0, ptr %ptr) {
+define void @test_lifetime_end(i64 %arg0) {
 ; CHECK: immarg operand has non-immediate parameter
 ; CHECK-NEXT: i64 %arg0
 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 %arg0, ptr %ptr)
+ %ptr = alloca i64
 call void @llvm.lifetime.end.p0(i64 %arg0, ptr %ptr)
 ret void
 }
diff --git a/llvm/test/Verifier/opaque-ptr.ll b/llvm/test/Verifier/opaque-ptr.ll
index 1f29000..10e43a4 100644
--- a/llvm/test/Verifier/opaque-ptr.ll
+++ b/llvm/test/Verifier/opaque-ptr.ll
@@ -37,12 +37,14 @@ define void @atomicrmw(ptr %a, i32 %i) {
 ret void
 }
 
-define void @opaque_mangle(ptr %a) {
+define void @opaque_mangle() {
 ; CHECK-LABEL: @opaque_mangle(
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[A:%.*]])
+; CHECK-NEXT: [[A:%.*]] = alloca i64, align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[A]])
 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[A]])
 ; CHECK-NEXT: ret void
 ;
+ %a = alloca i64
 call void @llvm.lifetime.start.p0(i64 8, ptr %a)
 call void @llvm.lifetime.end.p0(i64 8, ptr %a)
 ret void
diff --git a/llvm/test/tools/llvm-objdump/ELF/ARM/debug-vars-dwarf4.s b/llvm/test/tools/llvm-objdump/ELF/ARM/debug-vars-dwarf4.s
index 69b7489..085f258 100644
--- a/llvm/test/tools/llvm-objdump/ELF/ARM/debug-vars-dwarf4.s
+++ b/llvm/test/tools/llvm-objdump/ELF/ARM/debug-vars-dwarf4.s
@@ -15,10 +15,10 @@
 ## Check that passing the default value for --debug-vars-indent (52) makes no
 ## change to the output.
-# RUN: llvm-objdump %t.o -d --debug-vars --debug-vars-indent=52 | \
+# RUN: llvm-objdump %t.o -d --debug-vars --debug-indent=52 | \
 # RUN: FileCheck %s --check-prefix=RAW --strict-whitespace
 
-# RUN: llvm-objdump %t.o -d --debug-vars --debug-vars-indent=30 | \
+# RUN: llvm-objdump %t.o -d --debug-vars --debug-indent=30 | \
 # RUN: FileCheck %s --check-prefix=INDENT --strict-whitespace
 
 # RUN: llvm-objdump %t.o -d --debug-vars --no-show-raw-insn | \
diff --git a/llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc b/llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc
new file mode 100644
index 0000000..a708bc0
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc
@@ -0,0 +1,10 @@
+int bar(int x, int y) {
+  int sum = x + y;
+  int mul = x * y;
+  return sum + mul;
+}
+
+int foo(int a, int b) {
+  int result = bar(a, b);
+  return result;
+}
diff --git a/llvm/test/tools/llvm-objdump/X86/debug-inlined-functions.s b/llvm/test/tools/llvm-objdump/X86/debug-inlined-functions.s
new file mode 100644
index 0000000..6ed3507
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/X86/debug-inlined-functions.s
@@ -0,0 +1,643 @@
+## Generated with this compile command, with the source code in Inputs/debug-inlined-functions.cc:
+## clang++ -g -c debug-inlined-functions.cc -O1 -S -o -
+
+# RUN: llvm-mc -triple=x86_64 %s -filetype=obj -o %t.o
+
+# RUN: llvm-objdump %t.o -d --debug-inlined-funcs=unicode | \
+# RUN: FileCheck %s --check-prefixes=UNICODE,UNICODE-MANGLED --strict-whitespace
+
+# RUN: llvm-objdump %t.o -d -C --debug-inlined-funcs | \
+# RUN: FileCheck %s --check-prefixes=UNICODE,UNICODE-DEMANGLED --strict-whitespace
+
+# RUN: llvm-objdump %t.o -d -C --debug-inlined-funcs=unicode | \
+# RUN: FileCheck %s --check-prefixes=UNICODE,UNICODE-DEMANGLED --strict-whitespace
+
+# RUN: llvm-objdump %t.o -d -C --debug-inlined-funcs=unicode --debug-indent=30 | \
+# RUN: FileCheck %s --check-prefix=UNICODE-DEMANGLED-INDENT --strict-whitespace
+
+# RUN: llvm-objdump %t.o -d -C --debug-inlined-funcs=ascii | \
+# RUN: FileCheck %s --check-prefix=ASCII-DEMANGLED --strict-whitespace
+
+# RUN: llvm-objdump %t.o -d -C --debug-inlined-funcs=limits-only | \
+# RUN: FileCheck %s --check-prefix=LIMITS-ONLY-DEMANGLED
+
+# RUN: llvm-objdump %t.o -d -C --debug-inlined-funcs=unicode --debug-vars=unicode | \
+# RUN: FileCheck %s --check-prefix=DEBUG-DEMANGLED-ALL --strict-whitespace
+
+# UNICODE-MANGLED: 0000000000000000 <_Z3barii>:
+# UNICODE-DEMANGLED: 0000000000000000 <bar(int, int)>:
+# UNICODE-NEXT: 0: 8d 04 3e leal (%rsi,%rdi), %eax
+# UNICODE-NEXT: 3: 0f af f7 imull %edi, %esi
+# UNICODE-NEXT: 6: 01 f0 addl %esi, %eax
+# UNICODE-NEXT: 8: c3 retq
+# UNICODE-NEXT: 9: 0f 1f 80 00 00 00 00 nopl (%rax)
+# UNICODE-EMPTY:
+# UNICODE-MANGLED-NEXT: 0000000000000010 <_Z3fooii>:
+# UNICODE-DEMANGLED-NEXT: 0000000000000010 <foo(int, int)>:
+# UNICODE-MANGLED-NEXT: ┠─ _Z3barii = inlined into _Z3fooii
+# UNICODE-DEMANGLED-NEXT: ┠─ bar(int, int) = inlined into foo(int, int)
+# UNICODE-NEXT: 10: 8d 04 3e leal (%rsi,%rdi), %eax ┃
+# UNICODE-NEXT: 13: 0f af f7 imull %edi, %esi ┃
+# UNICODE-NEXT: 16: 01 f0 addl %esi, %eax ┻
+# UNICODE-NEXT: 18: c3 retq
+
+# UNICODE-DEMANGLED-INDENT: 0000000000000010 <foo(int, int)>:
+# UNICODE-DEMANGLED-INDENT-NEXT: ┠─ bar(int, int) = inlined into foo(int, int)
+# UNICODE-DEMANGLED-INDENT-NEXT: 10: 8d 04 3e leal (%rsi,%rdi), %eax ┃
+# UNICODE-DEMANGLED-INDENT-NEXT: 13: 0f af f7 imull %edi, %esi ┃
+# UNICODE-DEMANGLED-INDENT-NEXT: 16: 01 f0 addl %esi, %eax ┻
+# UNICODE-DEMANGLED-INDENT-NEXT: 18: c3 retq
+
+# ASCII-DEMANGLED: 0000000000000010 <foo(int, int)>:
+# ASCII-DEMANGLED-NEXT: |- bar(int, int) = inlined into foo(int, int)
+# ASCII-DEMANGLED-NEXT: 10: 8d 04 3e leal (%rsi,%rdi), %eax |
+# ASCII-DEMANGLED-NEXT: 13: 0f af f7 imull %edi, %esi |
+# ASCII-DEMANGLED-NEXT: 16: 01 f0 addl %esi, %eax v
+# ASCII-DEMANGLED-NEXT: 18: c3 retq
+
+# LIMITS-ONLY-DEMANGLED: 0000000000000010 <foo(int, int)>:
+# LIMITS-ONLY-DEMANGLED-NEXT: debug-inlined-functions.cc:8:16: bar(int, int) inlined into foo(int, int)
+# LIMITS-ONLY-DEMANGLED-NEXT: 10: 8d 04 3e leal (%rsi,%rdi), %eax
+# LIMITS-ONLY-DEMANGLED-NEXT: 13: 0f af f7 imull %edi, %esi
+# LIMITS-ONLY-DEMANGLED-NEXT: 16: 01 f0 addl %esi, %eax
+# LIMITS-ONLY-DEMANGLED-NEXT: debug-inlined-functions.cc:8:16: end of bar(int, int) inlined into foo(int, int)
+# LIMITS-ONLY-DEMANGLED-NEXT: 18: c3 retq
+
+# DEBUG-DEMANGLED-ALL: 0000000000000010 <foo(int, int)>:
+# DEBUG-DEMANGLED-ALL-NEXT: ┠─ a = RDI
+# DEBUG-DEMANGLED-ALL-NEXT: ┃ ┠─ b = RSI
+# DEBUG-DEMANGLED-ALL-NEXT: ┃ ┃ ┠─ bar(int, int) = inlined into foo(int, int)
+# DEBUG-DEMANGLED-ALL-NEXT: ┃ ┃ ┃ ┠─ x = RDI
+# DEBUG-DEMANGLED-ALL-NEXT: ┃ ┃ ┃ ┃ ┠─ y = RSI
+# DEBUG-DEMANGLED-ALL-NEXT: ┃ ┃ ┃ ┃ ┃ ┌─ sum = RAX
+# DEBUG-DEMANGLED-ALL-NEXT: 10: 8d 04 3e leal (%rsi,%rdi), %eax ┃ ┃ ┃ ┃ ┃ ╈
+# DEBUG-DEMANGLED-ALL-NEXT: ┃ ┃ ┃ ┃ ┃ ┃ ┌─ b = entry(RSI)
+# DEBUG-DEMANGLED-ALL-NEXT: ┃ ┃ ┃ ┃ ┃ ┃ │ ┌─ mul = RSI
+# DEBUG-DEMANGLED-ALL-NEXT: 13: 0f af f7 imull %edi, %esi ┃ ┻ ┃ ┃ ┻ ┃ ╈ ╈
+# DEBUG-DEMANGLED-ALL-NEXT: ┃ ┌─ result = RAX
+# DEBUG-DEMANGLED-ALL-NEXT: 16: 01 f0 addl %esi, %eax ┃ ╈ ┻ ┻ ┻ ┃ ┃
+# DEBUG-DEMANGLED-ALL-NEXT: 18: c3 retq ┻ ┻ ┻ ┻
+
+ .file "debug-inlined-functions.cc"
+ .text
+ .globl _Z3barii # -- Begin function _Z3barii
+ .p2align 4
+ .type _Z3barii,@function
+_Z3barii: # @_Z3barii
+.Lfunc_begin0:
+ .file 0 "debug-inlined-functions.cc" md5 0xf07b869ec4d0996589aa6856ae4e6c83
+ .cfi_startproc
+# %bb.0: # %entry
+ #DEBUG_VALUE: bar:x <- $edi
+ #DEBUG_VALUE: bar:y <- $esi
+ # kill: def $esi killed $esi def $rsi
+ # kill: def $edi killed $edi def $rdi
+ .loc 0 2 15 prologue_end # llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:2:15
+ leal (%rsi,%rdi), %eax
+.Ltmp0:
+ #DEBUG_VALUE: bar:sum <- $eax
+ .loc 0 3 15 # llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:3:15
+ imull %edi, %esi
+.Ltmp1:
+ #DEBUG_VALUE: bar:y <- [DW_OP_LLVM_entry_value 1] $esi
+ #DEBUG_VALUE: bar:mul <- $esi
+ .loc 0 4 14 # llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:4:14
+ addl %esi, %eax
+.Ltmp2:
+ .loc 0 4 3 is_stmt 0 # llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:4:3
+ retq
+.Ltmp3:
+.Lfunc_end0:
+ .size _Z3barii, .Lfunc_end0-_Z3barii
+ .cfi_endproc
+ # -- End function
+ .globl _Z3fooii # -- Begin function _Z3fooii
+ .p2align 4
+ .type _Z3fooii,@function
+_Z3fooii: # @_Z3fooii
+.Lfunc_begin1:
+ .cfi_startproc
+# %bb.0: # %entry
+ #DEBUG_VALUE: foo:a <- $edi
+ #DEBUG_VALUE: foo:b <- $esi
+ #DEBUG_VALUE: bar:x <- $edi
+ #DEBUG_VALUE: bar:y <- $esi
+ # kill: def $esi killed $esi def $rsi
+ # kill: def $edi killed $edi def $rdi
+ .loc 0 2 15 prologue_end is_stmt 1 # llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:2:15 @[ llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:8:16 ]
+ leal (%rsi,%rdi), %eax
+.Ltmp4:
+ #DEBUG_VALUE: bar:sum <- $eax
+ .loc 0 3 15 # llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:3:15 @[ llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:8:16 ]
+ imull %edi, %esi
+.Ltmp5:
+ #DEBUG_VALUE: foo:b <- [DW_OP_LLVM_entry_value 1] $esi
+ #DEBUG_VALUE: bar:mul <- $esi
+ .loc 0 4 14 # llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:4:14 @[ llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:8:16 ]
+ addl %esi, %eax
+.Ltmp6:
+ #DEBUG_VALUE: foo:result <- $eax
+ .loc 0 9 3 # llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:9:3
+ retq
+.Ltmp7:
+.Lfunc_end1:
+ .size _Z3fooii, .Lfunc_end1-_Z3fooii
+ .cfi_endproc
+ # -- End function
+ .section .debug_loclists,"",@progbits
+ .long .Ldebug_list_header_end0-.Ldebug_list_header_start0 # Length
+.Ldebug_list_header_start0:
+ .short 5 # Version
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+ .long 8 # Offset entry count
+.Lloclists_table_base0:
+ .long .Ldebug_loc0-.Lloclists_table_base0
+ .long .Ldebug_loc1-.Lloclists_table_base0
+ .long .Ldebug_loc2-.Lloclists_table_base0
+ .long .Ldebug_loc3-.Lloclists_table_base0
+ .long .Ldebug_loc4-.Lloclists_table_base0
+ .long .Ldebug_loc5-.Lloclists_table_base0
+ .long .Ldebug_loc6-.Lloclists_table_base0
+ .long .Ldebug_loc7-.Lloclists_table_base0
+.Ldebug_loc0:
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Lfunc_begin0-.Lfunc_begin0 # starting offset
+ .uleb128 .Ltmp1-.Lfunc_begin0 # ending offset
+ .byte 1 # Loc expr size
+ .byte 84 # super-register DW_OP_reg4
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Ltmp1-.Lfunc_begin0 # starting offset
+ .uleb128 .Lfunc_end0-.Lfunc_begin0 # ending offset
+ .byte 4 # Loc expr size
+ .byte 163 # DW_OP_entry_value
+ .byte 1 # 1
+ .byte 84 # super-register DW_OP_reg4
+ .byte 159 # DW_OP_stack_value
+ .byte 0 # DW_LLE_end_of_list
+.Ldebug_loc1:
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Ltmp0-.Lfunc_begin0 # starting offset
+ .uleb128 .Ltmp2-.Lfunc_begin0 # ending offset
+ .byte 1 # Loc expr size
+ .byte 80 # super-register DW_OP_reg0
+ .byte 0 # DW_LLE_end_of_list
+.Ldebug_loc2:
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Ltmp1-.Lfunc_begin0 # starting offset
+ .uleb128 .Lfunc_end0-.Lfunc_begin0 # ending offset
+ .byte 1 # Loc expr size
+ .byte 84 # super-register DW_OP_reg4
+ .byte 0 # DW_LLE_end_of_list
+.Ldebug_loc3:
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Lfunc_begin1-.Lfunc_begin0 # starting offset
+ .uleb128 .Ltmp5-.Lfunc_begin0 # ending offset
+ .byte 1 # Loc expr size
+ .byte 84 # super-register DW_OP_reg4
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Ltmp5-.Lfunc_begin0 # starting offset
+ .uleb128 .Lfunc_end1-.Lfunc_begin0 # ending offset
+ .byte 4 # Loc expr size
+ .byte 163 # DW_OP_entry_value
+ .byte 1 # 1
+ .byte 84 # super-register DW_OP_reg4
+ .byte 159 # DW_OP_stack_value
+ .byte 0 # DW_LLE_end_of_list
+.Ldebug_loc4:
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Lfunc_begin1-.Lfunc_begin0 # starting offset
+ .uleb128 .Ltmp5-.Lfunc_begin0 # ending offset
+ .byte 1 # Loc expr size
+ .byte 84 # super-register DW_OP_reg4
+ .byte 0 # DW_LLE_end_of_list
+.Ldebug_loc5:
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Ltmp4-.Lfunc_begin0 # starting offset
+ .uleb128 .Ltmp6-.Lfunc_begin0 # ending offset
+ .byte 1 # Loc expr size
+ .byte 80 # super-register DW_OP_reg0
+ .byte 0 # DW_LLE_end_of_list
+.Ldebug_loc6:
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Ltmp5-.Lfunc_begin0 # starting offset
+ .uleb128 .Lfunc_end1-.Lfunc_begin0 # ending offset
+ .byte 1 # Loc expr size
+ .byte 84 # super-register DW_OP_reg4
+ .byte 0 # DW_LLE_end_of_list
+.Ldebug_loc7:
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Ltmp6-.Lfunc_begin0 # starting offset
+ .uleb128 .Lfunc_end1-.Lfunc_begin0 # ending offset
+ .byte 1 # Loc expr size
+ .byte 80 # super-register DW_OP_reg0
+ .byte 0 # DW_LLE_end_of_list
+.Ldebug_list_header_end0:
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 37 # DW_FORM_strx1
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 115 # DW_AT_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .ascii "\214\001" # DW_AT_loclists_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 122 # DW_AT_call_all_calls
+ .byte 25 # DW_FORM_flag_present
+ .byte 49 # DW_AT_abstract_origin
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 49 # DW_AT_abstract_origin
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 34 # DW_FORM_loclistx
+ .byte 49 # DW_AT_abstract_origin
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 5 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 34 # DW_FORM_loclistx
+ .byte 49 # DW_AT_abstract_origin
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 6 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 110 # DW_AT_linkage_name
+ .byte 37 # DW_FORM_strx1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 32 # DW_AT_inline
+ .byte 33 # DW_FORM_implicit_const
+ .byte 1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 7 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 8 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 9 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 10 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 122 # DW_AT_call_all_calls
+ .byte 25 # DW_FORM_flag_present
+ .byte 110 # DW_AT_linkage_name
+ .byte 37 # DW_FORM_strx1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 11 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 12 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 34 # DW_FORM_loclistx
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 13 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 34 # DW_FORM_loclistx
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 14 # Abbreviation Code
+ .byte 29 # DW_TAG_inlined_subroutine
+ .byte 1 # DW_CHILDREN_yes
+ .byte 49 # DW_AT_abstract_origin
+ .byte 19 # DW_FORM_ref4
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 88 # DW_AT_call_file
+ .byte 11 # DW_FORM_data1
+ .byte 89 # DW_AT_call_line
+ .byte 11 # DW_FORM_data1
+ .byte 87 # DW_AT_call_column
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 1 # Abbrev [1] 0xc:0xc4 DW_TAG_compile_unit
+ .byte 0 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 1 # DW_AT_name
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .byte 2 # DW_AT_comp_dir
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end1-.Lfunc_begin0 # DW_AT_high_pc
+ .long .Laddr_table_base0 # DW_AT_addr_base
+ .long .Lloclists_table_base0 # DW_AT_loclists_base
+ .byte 2 # Abbrev [2] 0x27:0x26 DW_TAG_subprogram
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 87
+ # DW_AT_call_all_calls
+ .long 77 # DW_AT_abstract_origin
+ .byte 3 # Abbrev [3] 0x33:0x7 DW_TAG_formal_parameter
+ .byte 1 # DW_AT_location
+ .byte 85
+ .long 86 # DW_AT_abstract_origin
+ .byte 4 # Abbrev [4] 0x3a:0x6 DW_TAG_formal_parameter
+ .byte 0 # DW_AT_location
+ .long 94 # DW_AT_abstract_origin
+ .byte 5 # Abbrev [5] 0x40:0x6 DW_TAG_variable
+ .byte 1 # DW_AT_location
+ .long 102 # DW_AT_abstract_origin
+ .byte 5 # Abbrev [5] 0x46:0x6 DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .long 110 # DW_AT_abstract_origin
+ .byte 0 # End Of Children Mark
+ .byte 6 # Abbrev [6] 0x4d:0x2a DW_TAG_subprogram
+ .byte 3 # DW_AT_linkage_name
+ .byte 4 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 1 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ # DW_AT_external
+ # DW_AT_inline
+ .byte 7 # Abbrev [7] 0x56:0x8 DW_TAG_formal_parameter
+ .byte 6 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 1 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ .byte 7 # Abbrev [7] 0x5e:0x8 DW_TAG_formal_parameter
+ .byte 7 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 1 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ .byte 8 # Abbrev [8] 0x66:0x8 DW_TAG_variable
+ .byte 8 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ .byte 8 # Abbrev [8] 0x6e:0x8 DW_TAG_variable
+ .byte 9 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 9 # Abbrev [9] 0x77:0x4 DW_TAG_base_type
+ .byte 5 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 10 # Abbrev [10] 0x7b:0x54 DW_TAG_subprogram
+ .byte 1 # DW_AT_low_pc
+ .long .Lfunc_end1-.Lfunc_begin1 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 87
+ # DW_AT_call_all_calls
+ .byte 10 # DW_AT_linkage_name
+ .byte 11 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 7 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ # DW_AT_external
+ .byte 11 # Abbrev [11] 0x8b:0xa DW_TAG_formal_parameter
+ .byte 1 # DW_AT_location
+ .byte 85
+ .byte 12 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 7 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ .byte 12 # Abbrev [12] 0x95:0x9 DW_TAG_formal_parameter
+ .byte 3 # DW_AT_location
+ .byte 13 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 7 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ .byte 13 # Abbrev [13] 0x9e:0x9 DW_TAG_variable
+ .byte 7 # DW_AT_location
+ .byte 14 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 8 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ .byte 14 # Abbrev [14] 0xa7:0x27 DW_TAG_inlined_subroutine
+ .long 77 # DW_AT_abstract_origin
+ .byte 1 # DW_AT_low_pc
+ .long .Ltmp6-.Lfunc_begin1 # DW_AT_high_pc
+ .byte 0 # DW_AT_call_file
+ .byte 8 # DW_AT_call_line
+ .byte 16 # DW_AT_call_column
+ .byte 3 # Abbrev [3] 0xb4:0x7 DW_TAG_formal_parameter
+ .byte 1 # DW_AT_location
+ .byte 85
+ .long 86 # DW_AT_abstract_origin
+ .byte 4 # Abbrev [4] 0xbb:0x6 DW_TAG_formal_parameter
+ .byte 4 # DW_AT_location
+ .long 94 # DW_AT_abstract_origin
+ .byte 5 # Abbrev [5] 0xc1:0x6 DW_TAG_variable
+ .byte 5 # DW_AT_location
+ .long 102 # DW_AT_abstract_origin
+ .byte 5 # Abbrev [5] 0xc7:0x6 DW_TAG_variable
+ .byte 6 # DW_AT_location
+ .long 110 # DW_AT_abstract_origin
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end0:
+ .section .debug_str_offsets,"",@progbits
+ .long 64 # Length of String Offsets Set
+ .short 5
+ .short 0
+.Lstr_offsets_base0:
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "clang version 21.0.0git (git@github.com:llvm/llvm-project.git eed98e1493414ae9c30596b1eeb8f4a9b260e42)" # string offset=0
+.Linfo_string1:
+ .asciz "llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc" # string offset=112
+.Linfo_string2:
+ .asciz "llvm-project" # string offset=179
+.Linfo_string3:
+ .asciz "_Z3barii" # string offset=229
+.Linfo_string4:
+ .asciz "bar" # string offset=238
+.Linfo_string5:
+ .asciz "int" # string offset=242
+.Linfo_string6:
+ .asciz "x" # string offset=246
+.Linfo_string7:
+ .asciz "y" # string offset=248
+.Linfo_string8:
+ .asciz "sum" # string offset=250
+.Linfo_string9:
+ .asciz "mul" # string offset=254
+.Linfo_string10:
+ .asciz "_Z3fooii" # string offset=258
+.Linfo_string11:
+ .asciz "foo" # string offset=267
+.Linfo_string12:
+ .asciz "a" # string offset=271
+.Linfo_string13:
+ .asciz "b" # string offset=273
+.Linfo_string14:
+ .asciz "result" # string offset=275
+ .section .debug_str_offsets,"",@progbits
+ .long .Linfo_string0
+ .long .Linfo_string1
+ .long .Linfo_string2
+ .long .Linfo_string3
+ .long .Linfo_string4
+ .long .Linfo_string5
+ .long .Linfo_string6
+ .long .Linfo_string7
+ .long .Linfo_string8
+ .long .Linfo_string9
+ .long .Linfo_string10
+ .long .Linfo_string11
+ .long .Linfo_string12
+ .long .Linfo_string13
+ .long .Linfo_string14
+ .section .debug_addr,"",@progbits
+ .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+ .short 5 # DWARF version number
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+.Laddr_table_base0:
+ .quad .Lfunc_begin0
+ .quad .Lfunc_begin1
+.Ldebug_addr_end0:
+ .ident "clang version 21.0.0git (git@github.com:llvm/llvm-project.git eed98e1493414ae9c30596b1eeb8f4a9b260e42a)"
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/llvm/test/tools/llvm-readobj/ELF/sframe-header.test b/llvm/test/tools/llvm-readobj/ELF/sframe-header.test
new file mode 100644
index 0000000..f827296
--- /dev/null
+++ b/llvm/test/tools/llvm-readobj/ELF/sframe-header.test
@@ -0,0 +1,148 @@
+## Check parsing and dumping of the SFrame header.
+# RUN: yaml2obj --docnum=1 %s -o %t.1
+# RUN: llvm-readobj --sframe=.sframe_bad_sh_size --sframe=.sframe_1b \
+# RUN: --sframe=.sframe_bad_magic --sframe=.sframe_bad_version \
+# RUN: --sframe=.sframe_6b --sframe=.sframe_header %t.1 2>&1 | \
+# RUN: FileCheck %s --strict-whitespace --match-full-lines \
+# RUN: -DFILE=%t.1 --check-prefix=CASE1
+
+## Check big-endian support and the handling of --sframe argument default.
+# RUN: yaml2obj --docnum=2 %s -o %t.2
+# RUN: llvm-readobj --sframe %t.2 2>&1 | \
+# RUN: FileCheck %s --strict-whitespace --match-full-lines \
+# RUN: -DFILE=%t.2 --check-prefix=CASE2
+
+## Check handling of corrupted elf files (bad sh_name)
+# RUN: yaml2obj --docnum=3 %s -o %t.3
+# RUN: not llvm-readobj --sframe %t.3 2>&1 | \
+# RUN: FileCheck %s --strict-whitespace --match-full-lines \
+# RUN: -DFILE=%t.3 --check-prefix=CASE3
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data:  ELFDATA2LSB
+  Type:  ET_EXEC
+Sections:
+  - Name: .sframe_bad_sh_size
+    Type: SHT_GNU_SFRAME
+    Flags: [ SHF_ALLOC ]
+    ShSize: 0xfffff
+# CASE1-LABEL:SFrame section '.sframe_bad_sh_size' {
+# CASE1:{{.*}}: warning: '[[FILE]]': The end of the file was unexpectedly encountered
+  - Name: .sframe_1b
+    Type: SHT_GNU_SFRAME
+    Flags: [ SHF_ALLOC ]
+    ContentArray: [ 0x00 ]
+# CASE1-LABEL:SFrame section '.sframe_1b' {
+# CASE1:{{.*}}: warning: '[[FILE]]': invalid sframe section: unexpected end of data at offset 0x1 while reading [0x0, 0x4)
+
+  - Name: .sframe_bad_magic
+    Type: SHT_GNU_SFRAME
+    Flags: [ SHF_ALLOC ]
+    ContentArray: [ 0xde, 0xad, 0xbe, 0xef]
+# CASE1-LABEL:SFrame section '.sframe_bad_magic' {
+# CASE1:{{.*}}: warning: '[[FILE]]': invalid sframe section: invalid magic number (0xadde)
+
+  - Name: .sframe_bad_version
+    Type: SHT_GNU_SFRAME
+    Flags: [ SHF_ALLOC ]
+    ContentArray: [
+      0xe2, 0xde, 0x01, 0x00  # Preamble (magic, version, flags)
+    ]
+# CASE1-LABEL:SFrame section '.sframe_bad_version' {
+# CASE1:{{.*}}: warning: '[[FILE]]': invalid sframe section: invalid/unsupported version number (1)
+
+  - Name: .sframe_6b
+    Type: SHT_GNU_SFRAME
+    Flags: [ SHF_ALLOC ]
+    ContentArray: [
+      0xe2, 0xde, 0x02, 0x00,  # Preamble (magic, version, flags)
+      0x01, 0x02
+    ]
+
+# CASE1-LABEL:SFrame section '.sframe_6b' {
+# CASE1:{{.*}}: warning: '[[FILE]]': invalid sframe section: unexpected end of data at offset 0x6 while reading [0x0, 0x1c)
+
+  - Name: .sframe_header
+    Type: SHT_GNU_SFRAME
+    Flags: [ SHF_ALLOC ]
+    ContentArray: [
+      0xe2, 0xde, 0x02, 0x06,  # Preamble (magic, version, flags)
+      # Header:
+      0x03, 0x42, 0x47, 0x00,  # ABI, Fixed FP offset, Fixed RA Offset, AUX header length
+      0x01, 0x00, 0x00, 0x00,  # Number of FDEs
+      0x10, 0x00, 0x00, 0x00,  # Number of FREs
+      0x00, 0x10, 0x00, 0x00,  # FRE length
+      0x04, 0x00, 0x00, 0x00,  # FDE offset
+      0x00, 0x01, 0x00, 0x00,  # FRE offset
+    ]
+# CASE1-LABEL:SFrame section '.sframe_header' {
+# CASE1: Header {
+# CASE1-NEXT: Magic: 0xDEE2
+# CASE1-NEXT: Version: V2 (0x2)
+# CASE1-NEXT: Flags [ (0x6)
+# CASE1-NEXT: FDEFuncStartPCRel (0x4){{ *}}
+# CASE1-NEXT: FramePointer (0x2){{ *}}
+# CASE1-NEXT: ]
+# CASE1-NEXT: ABI: AMD64EndianLittle (0x3)
+# CASE1-NEXT: CFA fixed FP offset (unused): 66
+# CASE1-NEXT: CFA fixed RA offset: 71
+# CASE1-NEXT: Auxiliary header length: 0
+# CASE1-NEXT: Num FDEs: 1
+# CASE1-NEXT: Num FREs: 16
+# CASE1-NEXT: FRE subsection length: 4096
+# CASE1-NEXT: FDE subsection offset: 4
+# CASE1-NEXT: FRE subsection offset: 256
+# CASE1-NEXT: }
+# CASE1-NEXT:}
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data:  ELFDATA2MSB
+  Type:  ET_EXEC
+Sections:
+  - Name: .sframe
+    Type: SHT_GNU_SFRAME
+    Flags: [ SHF_ALLOC ]
+    ContentArray: [
+      0xde, 0xe2, 0x02, 0x01,  # Preamble (magic, version, flags)
+      # Header:
+      0x01, 0x42, 0x47, 0x00,  # ABI, Fixed FP offset, Fixed RA Offset, AUX header length
+      0x00, 0x00, 0x00, 0x01,  # Number of FDEs
+      0x00, 0x00, 0x00, 0x10,  # Number of FREs
+      0x00, 0x00, 0x10, 0x00,  # FRE length
+      0x00, 0x00, 0x00, 0x04,  # FDE offset
+      0x00, 0x00, 0x01, 0x00,  # FRE offset
+    ]
+# CASE2-LABEL:SFrame section '.sframe' {
+# CASE2: Header {
+# CASE2-NEXT: Magic: 0xDEE2
+# CASE2-NEXT: Version: V2 (0x2)
+# CASE2-NEXT: Flags [ (0x1)
+# CASE2-NEXT: FDESorted (0x1){{ *}}
+# CASE2-NEXT: ]
+# CASE2-NEXT: ABI: AArch64EndianBig (0x1)
+# CASE2-NEXT: CFA fixed FP offset (unused): 66
+# CASE2-NEXT: CFA fixed RA offset (unused): 71
+# CASE2-NEXT: Auxiliary header length: 0
+# CASE2-NEXT: Num FDEs: 1
+# CASE2-NEXT: Num FREs: 16
+# CASE2-NEXT: FRE subsection length: 4096
+# CASE2-NEXT: FDE subsection offset: 4
+# CASE2-NEXT: FRE subsection offset: 256
+# CASE2-NEXT: }
+# CASE2-NEXT:}
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data:  ELFDATA2MSB
+  Type:  ET_EXEC
+Sections:
+  - Name: .corrupted
+    Type: SHT_GNU_SFRAME
+    Flags: [ SHF_ALLOC ]
+    ShName: 0x10000
+# CASE3:{{.*}}: error: '[[FILE]]': a section [index 1] has an invalid sh_name (0x10000) offset which goes past the end of the section name string table