Diffstat (limited to 'llvm/test')
-rw-r--r-- llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll | 48
-rw-r--r-- llvm/test/Assembler/metadata-annotations.ll | 9
-rw-r--r-- llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll | 489
-rw-r--r-- llvm/test/CodeGen/AArch64/aarch64-smull.ll | 14
-rw-r--r-- llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll | 178
-rw-r--r-- llvm/test/CodeGen/AArch64/dup.ll | 12
-rw-r--r-- llvm/test/CodeGen/AArch64/load-zext-bitcast.ll | 45
-rw-r--r-- llvm/test/CodeGen/AArch64/sme-za-exceptions.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 22
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir | 97
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll | 21
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 69
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 738
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir | 275
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminimum.mir | 275
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 24
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll | 56
-rw-r--r-- llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll | 144
-rw-r--r-- llvm/test/CodeGen/AMDGPU/div_i128.ll | 32
-rw-r--r-- llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll | 22
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fmaximum.ll | 921
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fminimum.ll | 921
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 44
-rw-r--r-- llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll | 222
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/binary-format.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll | 11
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-builtin.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-global-var.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-var-func.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-var-section.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/filename.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/func-non-void.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/func-source.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/func-typedef.ll | 8
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/func-void.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll | 28
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll | 24
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/local-var.ll | 14
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/pruning-const.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/static-func.ll | 13
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/static-var-inited.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/static-var-sec.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/static-var.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll | 8
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/weak-global-2.ll | 5
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/weak-global.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll | 12
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll | 9
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll | 13
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll | 12
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll | 12
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll | 12
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll | 12
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll | 12
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll | 18
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll | 14
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll | 12
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll | 14
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll | 14
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll | 14
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll | 14
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll | 14
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll | 14
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll | 14
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll | 27
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll | 18
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll | 22
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll | 18
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll | 18
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll | 18
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll | 19
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll | 18
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll | 18
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll | 27
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll | 12
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll | 12
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll | 13
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll | 13
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll | 20
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll | 20
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll | 20
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll | 24
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll | 18
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll | 26
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll | 26
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll | 18
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll | 20
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll | 26
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/store-addr.ll | 27
-rw-r--r-- llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll | 19
-rw-r--r-- llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll | 19
-rw-r--r-- llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll | 17
-rw-r--r-- llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll | 17
-rw-r--r-- llvm/test/CodeGen/BPF/callx.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/dwarfdump.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/i128.ll | 22
-rw-r--r-- llvm/test/CodeGen/BPF/is_trunc_free.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/is_zext_free.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/objdump_two_funcs.ll | 9
-rw-r--r-- llvm/test/CodeGen/BPF/optnone-1.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/reloc-btf-2.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/reloc-btf.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/simplifycfg.ll | 27
-rw-r--r-- llvm/test/CodeGen/BPF/warn-stack.ll | 30
-rw-r--r-- llvm/test/CodeGen/BPF/xadd.ll | 7
-rw-r--r-- llvm/test/CodeGen/DirectX/legalize-module-flags.ll | 2
-rw-r--r-- llvm/test/CodeGen/DirectX/legalize-module-flags2.ll | 2
-rw-r--r-- llvm/test/CodeGen/DirectX/llc-pipeline.ll | 2
-rw-r--r-- llvm/test/CodeGen/DirectX/metadata-stripping.ll | 2
-rw-r--r-- llvm/test/CodeGen/DirectX/strip-llvm-errno-tbaa.ll | 5
-rw-r--r-- llvm/test/CodeGen/DirectX/strip-rootsignatures.ll | 5
-rw-r--r-- llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll | 63
-rw-r--r-- llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll | 55
-rw-r--r-- llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll | 54
-rw-r--r-- llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll | 52
-rw-r--r-- llvm/test/CodeGen/Hexagon/masked_gather.ll | 58
-rw-r--r-- llvm/test/CodeGen/Hexagon/vector-gather.ll | 27
-rw-r--r-- llvm/test/CodeGen/Mips/Fast-ISel/br1.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/Fast-ISel/loadstore2.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/Fast-ISel/loadstrconst.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/Fast-ISel/logopm.ll | 4
-rw-r--r-- llvm/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/Fast-ISel/simplestorei.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/beqzc.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/beqzc1.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/brsize3.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/brsize3a.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/ci2.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/cmplarge.ll | 4
-rw-r--r-- llvm/test/CodeGen/Mips/const1.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/const4a.ll | 4
-rw-r--r-- llvm/test/CodeGen/Mips/const6.ll | 4
-rw-r--r-- llvm/test/CodeGen/Mips/const6a.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/ctlz.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/delay-slot-fill-forward.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/f16abs.ll | 4
-rw-r--r-- llvm/test/CodeGen/Mips/fp16instrinsmc.ll | 4
-rw-r--r-- llvm/test/CodeGen/Mips/fpneeded.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/fpnotneeded.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/hf16call32.ll | 4
-rw-r--r-- llvm/test/CodeGen/Mips/hf16call32_body.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/hfptrcall.ll | 4
-rw-r--r-- llvm/test/CodeGen/Mips/l3mc.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/lcb2.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/lcb3c.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/lcb4a.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/lcb5.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/mbrsize4a.ll | 4
-rw-r--r-- llvm/test/CodeGen/Mips/micromips-attr.ll | 4
-rw-r--r-- llvm/test/CodeGen/Mips/mips16-hf-attr-2.ll | 6
-rw-r--r-- llvm/test/CodeGen/Mips/mips16-hf-attr.ll | 6
-rw-r--r-- llvm/test/CodeGen/Mips/mips16_32_1.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/mips16_32_10.ll | 6
-rw-r--r-- llvm/test/CodeGen/Mips/mips16_32_3.ll | 6
-rw-r--r-- llvm/test/CodeGen/Mips/mips16_32_4.ll | 6
-rw-r--r-- llvm/test/CodeGen/Mips/mips16_32_5.ll | 6
-rw-r--r-- llvm/test/CodeGen/Mips/mips16_32_6.ll | 6
-rw-r--r-- llvm/test/CodeGen/Mips/mips16_32_7.ll | 6
-rw-r--r-- llvm/test/CodeGen/Mips/mips16_32_8.ll | 8
-rw-r--r-- llvm/test/CodeGen/Mips/mips16_32_9.ll | 6
-rw-r--r-- llvm/test/CodeGen/Mips/nomips16.ll | 4
-rw-r--r-- llvm/test/CodeGen/Mips/pbqp-reserved-physreg.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/powif64_16.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/s2rem.ll | 4
-rw-r--r-- llvm/test/CodeGen/Mips/sel1c.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/sel2c.ll | 2
-rw-r--r-- llvm/test/CodeGen/Mips/simplebr.ll | 4
-rw-r--r-- llvm/test/CodeGen/Mips/sr1.ll | 4
-rw-r--r-- llvm/test/CodeGen/Mips/tnaked.ll | 4
-rw-r--r-- llvm/test/CodeGen/RISCV/rv32p.ll | 709
-rw-r--r-- llvm/test/CodeGen/RISCV/rv64p.ll | 677
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/sf_vfbfexp16e.ll | 191
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/sf_vfexp16e.ll | 191
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/sf_vfexp32e.ll | 160
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/sf_vfexpa.ll | 335
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/sf_vfexpa64e.ll | 125
-rw-r--r-- llvm/test/CodeGen/X86/issue163738.ll | 13
-rw-r--r-- llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll | 10
-rw-r--r-- llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll | 83
-rw-r--r-- llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll | 17
-rw-r--r-- llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll | 485
-rw-r--r-- llvm/test/MC/ELF/cfi-sframe-cfi-escape-diagnostics.s | 36
-rw-r--r-- llvm/test/MC/ELF/cfi-sframe-cfi-escape.s | 46
-rw-r--r-- llvm/test/MC/Hexagon/arch-support.s | 3
-rw-r--r-- llvm/test/MC/Hexagon/v81_arch.s | 10
-rw-r--r-- llvm/test/MC/PowerPC/ppc64-encoding-ext.s | 14
-rw-r--r-- llvm/test/Transforms/InstCombine/ctlz-cttz.ll | 145
-rw-r--r-- llvm/test/Transforms/InstCombine/scmp.ll | 56
-rw-r--r-- llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll | 3
-rw-r--r-- llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s | 34
-rw-r--r-- llvm/test/tools/llvm-profdata/input-wildcard.test | 15
232 files changed, 7786 insertions, 2333 deletions
diff --git a/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll b/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll
new file mode 100644
index 0000000..64fad37
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output "-passes=print<da>" 2>&1 | FileCheck %s
+
+; for (i = 0; i < 3; i++) {
+; a[-k * i] = 1;
+; a[-k * i + (2 * k + 1)] = 2;
+; }
+;
+; When k = -1, a dependence exists between the two stores. The accesses will be:
+;
+; - a[-k * i]               : a[ 0], a[ 1], a[ 2]
+; - a[-k * i + (2 * k + 1)] : a[-1], a[ 0], a[ 1]
+;
+; We cannot determine the sign of `k` and `2*k + 1` at compile time, so the analysis conservatively reports an output dependence between the two stores.
+;
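+; To see why k = -1 is the interesting value: the stores alias when
+; -k * i1 = -k * i2 + (2 * k + 1), i.e. k * (i2 - i1) = 2 * k + 1, so
+; i2 - i1 = 2 + 1/k. This is integral only for k = 1 or k = -1, and k = 1 would
+; require a distance of 3, which a 3-iteration loop cannot produce.
+;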
+define void @unknown_sign(ptr %a, i64 %k) {
+; CHECK-LABEL: 'unknown_sign'
+; CHECK-NEXT: Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.0, align 1
+; CHECK-NEXT: da analyze - none!
+; CHECK-NEXT: Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 2, ptr %idx.1, align 1
+; CHECK-NEXT: da analyze - output [<>]!
+; CHECK-NEXT: Src: store i8 2, ptr %idx.1, align 1 --> Dst: store i8 2, ptr %idx.1, align 1
+; CHECK-NEXT: da analyze - none!
+;
+entry:
+ %k.neg = sub nsw i64 0, %k
+ %kk = mul nsw i64 %k, 2
+ %subscript.1.init = add i64 1, %kk
+ br label %loop
+
+loop:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
+ %subscript.0 = phi i64 [ 0, %entry ], [ %subscript.0.next, %loop ]
+ %subscript.1 = phi i64 [ %subscript.1.init, %entry ], [ %subscript.1.next, %loop ]
+ %idx.0 = getelementptr i8, ptr %a, i64 %subscript.0
+ %idx.1 = getelementptr i8, ptr %a, i64 %subscript.1
+ store i8 1, ptr %idx.0
+ store i8 2, ptr %idx.1
+ %i.next = add i64 %i, 1
+ %subscript.0.next = add nsw i64 %subscript.0, %k.neg
+ %subscript.1.next = add nsw i64 %subscript.1, %k.neg
+ %cond.exit = icmp eq i64 %i.next, 3
+ br i1 %cond.exit, label %exit, label %loop
+
+exit:
+ ret void
+}
+
diff --git a/llvm/test/Assembler/metadata-annotations.ll b/llvm/test/Assembler/metadata-annotations.ll
new file mode 100644
index 0000000..4fd4713
--- /dev/null
+++ b/llvm/test/Assembler/metadata-annotations.ll
@@ -0,0 +1,9 @@
+; RUN: llvm-as < %s | llvm-dis --materialize-metadata --show-annotations | FileCheck %s
+
+; CHECK: ; Materializable
+; CHECK-NEXT: define dso_local i32 @test() {}
+define dso_local i32 @test() {
+entry:
+ ret i32 0
+}
+
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 0933e67..b54f262 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -749,12 +749,429 @@ for.body: ; preds = %for.body.preheader1
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
+define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none) %A, i8 noundef %B, i32 noundef %n) {
+; CHECK-SD-LABEL: red_mla_dup_ext_u8_s8_s64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT: cbz w2, .LBB6_3
+; CHECK-SD-NEXT: // %bb.1: // %iter.check
+; CHECK-SD-NEXT: str x25, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-SD-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT: .cfi_offset w19, -8
+; CHECK-SD-NEXT: .cfi_offset w20, -16
+; CHECK-SD-NEXT: .cfi_offset w21, -24
+; CHECK-SD-NEXT: .cfi_offset w22, -32
+; CHECK-SD-NEXT: .cfi_offset w23, -40
+; CHECK-SD-NEXT: .cfi_offset w24, -48
+; CHECK-SD-NEXT: .cfi_offset w25, -64
+; CHECK-SD-NEXT: sxtb x9, w1
+; CHECK-SD-NEXT: cmp w2, #3
+; CHECK-SD-NEXT: mov w10, w2
+; CHECK-SD-NEXT: b.hi .LBB6_4
+; CHECK-SD-NEXT: // %bb.2:
+; CHECK-SD-NEXT: mov x11, xzr
+; CHECK-SD-NEXT: mov x8, xzr
+; CHECK-SD-NEXT: b .LBB6_13
+; CHECK-SD-NEXT: .LBB6_3:
+; CHECK-SD-NEXT: mov x0, xzr
+; CHECK-SD-NEXT: ret
+; CHECK-SD-NEXT: .LBB6_4: // %vector.main.loop.iter.check
+; CHECK-SD-NEXT: dup v0.2d, x9
+; CHECK-SD-NEXT: cmp w2, #16
+; CHECK-SD-NEXT: b.hs .LBB6_6
+; CHECK-SD-NEXT: // %bb.5:
+; CHECK-SD-NEXT: mov x11, xzr
+; CHECK-SD-NEXT: mov x8, xzr
+; CHECK-SD-NEXT: b .LBB6_10
+; CHECK-SD-NEXT: .LBB6_6: // %vector.ph
+; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: and x12, x10, #0xc
+; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
+; CHECK-SD-NEXT: and x11, x10, #0xfffffff0
+; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v7.2d, #0000000000000000
+; CHECK-SD-NEXT: mov x15, x0
+; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
+; CHECK-SD-NEXT: and x16, x10, #0xfffffff0
+; CHECK-SD-NEXT: movi v6.2d, #0000000000000000
+; CHECK-SD-NEXT: fmov x13, d0
+; CHECK-SD-NEXT: fmov x14, d0
+; CHECK-SD-NEXT: .LBB6_7: // %vector.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldr q17, [x15], #16
+; CHECK-SD-NEXT: subs x16, x16, #16
+; CHECK-SD-NEXT: ushll v18.8h, v17.8b, #0
+; CHECK-SD-NEXT: ushll2 v19.8h, v17.16b, #0
+; CHECK-SD-NEXT: ushll v17.4s, v18.4h, #0
+; CHECK-SD-NEXT: ushll2 v20.4s, v19.8h, #0
+; CHECK-SD-NEXT: ushll2 v18.4s, v18.8h, #0
+; CHECK-SD-NEXT: ushll v19.4s, v19.4h, #0
+; CHECK-SD-NEXT: ushll v21.2d, v17.2s, #0
+; CHECK-SD-NEXT: ushll2 v22.2d, v20.4s, #0
+; CHECK-SD-NEXT: ushll2 v17.2d, v17.4s, #0
+; CHECK-SD-NEXT: ushll v23.2d, v18.2s, #0
+; CHECK-SD-NEXT: ushll v20.2d, v20.2s, #0
+; CHECK-SD-NEXT: ushll2 v18.2d, v18.4s, #0
+; CHECK-SD-NEXT: fmov x17, d21
+; CHECK-SD-NEXT: mov x2, v21.d[1]
+; CHECK-SD-NEXT: ushll v21.2d, v19.2s, #0
+; CHECK-SD-NEXT: ushll2 v19.2d, v19.4s, #0
+; CHECK-SD-NEXT: fmov x18, d22
+; CHECK-SD-NEXT: fmov x1, d17
+; CHECK-SD-NEXT: fmov x3, d23
+; CHECK-SD-NEXT: fmov x21, d20
+; CHECK-SD-NEXT: fmov x22, d18
+; CHECK-SD-NEXT: fmov x19, d21
+; CHECK-SD-NEXT: mul x17, x13, x17
+; CHECK-SD-NEXT: mov x4, v22.d[1]
+; CHECK-SD-NEXT: fmov x24, d19
+; CHECK-SD-NEXT: mov x5, v23.d[1]
+; CHECK-SD-NEXT: mov x6, v21.d[1]
+; CHECK-SD-NEXT: mov x7, v20.d[1]
+; CHECK-SD-NEXT: mov x20, v18.d[1]
+; CHECK-SD-NEXT: mov x23, v19.d[1]
+; CHECK-SD-NEXT: mov x25, v17.d[1]
+; CHECK-SD-NEXT: mul x18, x14, x18
+; CHECK-SD-NEXT: mul x1, x13, x1
+; CHECK-SD-NEXT: fmov d17, x17
+; CHECK-SD-NEXT: mul x3, x13, x3
+; CHECK-SD-NEXT: fmov d18, x18
+; CHECK-SD-NEXT: mul x19, x13, x19
+; CHECK-SD-NEXT: fmov d19, x1
+; CHECK-SD-NEXT: mul x21, x13, x21
+; CHECK-SD-NEXT: fmov d20, x3
+; CHECK-SD-NEXT: mul x22, x13, x22
+; CHECK-SD-NEXT: fmov d21, x19
+; CHECK-SD-NEXT: mul x24, x13, x24
+; CHECK-SD-NEXT: fmov d24, x21
+; CHECK-SD-NEXT: mul x2, x8, x2
+; CHECK-SD-NEXT: fmov d22, x22
+; CHECK-SD-NEXT: mul x4, x8, x4
+; CHECK-SD-NEXT: fmov d23, x24
+; CHECK-SD-NEXT: mul x5, x8, x5
+; CHECK-SD-NEXT: mov v17.d[1], x2
+; CHECK-SD-NEXT: mul x6, x8, x6
+; CHECK-SD-NEXT: mov v18.d[1], x4
+; CHECK-SD-NEXT: mul x7, x8, x7
+; CHECK-SD-NEXT: mov v20.d[1], x5
+; CHECK-SD-NEXT: add v1.2d, v17.2d, v1.2d
+; CHECK-SD-NEXT: mul x20, x8, x20
+; CHECK-SD-NEXT: mov v21.d[1], x6
+; CHECK-SD-NEXT: add v6.2d, v18.2d, v6.2d
+; CHECK-SD-NEXT: mul x23, x8, x23
+; CHECK-SD-NEXT: mov v24.d[1], x7
+; CHECK-SD-NEXT: add v4.2d, v20.2d, v4.2d
+; CHECK-SD-NEXT: mul x17, x8, x25
+; CHECK-SD-NEXT: mov v22.d[1], x20
+; CHECK-SD-NEXT: add v7.2d, v21.2d, v7.2d
+; CHECK-SD-NEXT: mov v23.d[1], x23
+; CHECK-SD-NEXT: add v16.2d, v24.2d, v16.2d
+; CHECK-SD-NEXT: mov v19.d[1], x17
+; CHECK-SD-NEXT: add v3.2d, v22.2d, v3.2d
+; CHECK-SD-NEXT: add v5.2d, v23.2d, v5.2d
+; CHECK-SD-NEXT: add v2.2d, v19.2d, v2.2d
+; CHECK-SD-NEXT: b.ne .LBB6_7
+; CHECK-SD-NEXT: // %bb.8: // %middle.block
+; CHECK-SD-NEXT: add v1.2d, v1.2d, v7.2d
+; CHECK-SD-NEXT: add v4.2d, v4.2d, v16.2d
+; CHECK-SD-NEXT: cmp x11, x10
+; CHECK-SD-NEXT: add v2.2d, v2.2d, v5.2d
+; CHECK-SD-NEXT: add v3.2d, v3.2d, v6.2d
+; CHECK-SD-NEXT: add v1.2d, v1.2d, v4.2d
+; CHECK-SD-NEXT: add v2.2d, v2.2d, v3.2d
+; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
+; CHECK-SD-NEXT: addp d1, v1.2d
+; CHECK-SD-NEXT: fmov x8, d1
+; CHECK-SD-NEXT: b.eq .LBB6_15
+; CHECK-SD-NEXT: // %bb.9: // %vec.epilog.iter.check
+; CHECK-SD-NEXT: cbz x12, .LBB6_13
+; CHECK-SD-NEXT: .LBB6_10: // %vec.epilog.ph
+; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
+; CHECK-SD-NEXT: mov x13, x11
+; CHECK-SD-NEXT: movi v3.2d, #0x000000000000ff
+; CHECK-SD-NEXT: fmov x14, d0
+; CHECK-SD-NEXT: and x11, x10, #0xfffffffc
+; CHECK-SD-NEXT: fmov x15, d0
+; CHECK-SD-NEXT: sub x12, x13, x11
+; CHECK-SD-NEXT: add x13, x0, x13
+; CHECK-SD-NEXT: mov v1.d[0], x8
+; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: .LBB6_11: // %vec.epilog.vector.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldr s0, [x13], #4
+; CHECK-SD-NEXT: adds x12, x12, #4
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v4.2d, v0.2s, #0
+; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-SD-NEXT: and v4.16b, v4.16b, v3.16b
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-SD-NEXT: fmov x16, d4
+; CHECK-SD-NEXT: fmov x18, d0
+; CHECK-SD-NEXT: mov x17, v4.d[1]
+; CHECK-SD-NEXT: mov x1, v0.d[1]
+; CHECK-SD-NEXT: mul x16, x14, x16
+; CHECK-SD-NEXT: mul x18, x15, x18
+; CHECK-SD-NEXT: mul x17, x8, x17
+; CHECK-SD-NEXT: fmov d0, x16
+; CHECK-SD-NEXT: mul x1, x8, x1
+; CHECK-SD-NEXT: fmov d4, x18
+; CHECK-SD-NEXT: mov v0.d[1], x17
+; CHECK-SD-NEXT: mov v4.d[1], x1
+; CHECK-SD-NEXT: add v1.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: add v2.2d, v4.2d, v2.2d
+; CHECK-SD-NEXT: b.ne .LBB6_11
+; CHECK-SD-NEXT: // %bb.12: // %vec.epilog.middle.block
+; CHECK-SD-NEXT: add v0.2d, v1.2d, v2.2d
+; CHECK-SD-NEXT: cmp x11, x10
+; CHECK-SD-NEXT: addp d0, v0.2d
+; CHECK-SD-NEXT: fmov x8, d0
+; CHECK-SD-NEXT: b.eq .LBB6_15
+; CHECK-SD-NEXT: .LBB6_13: // %for.body.preheader
+; CHECK-SD-NEXT: sub x10, x10, x11
+; CHECK-SD-NEXT: add x11, x0, x11
+; CHECK-SD-NEXT: .LBB6_14: // %for.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldrb w12, [x11], #1
+; CHECK-SD-NEXT: subs x10, x10, #1
+; CHECK-SD-NEXT: smaddl x8, w12, w9, x8
+; CHECK-SD-NEXT: b.ne .LBB6_14
+; CHECK-SD-NEXT: .LBB6_15:
+; CHECK-SD-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT: ldr x25, [sp], #64 // 8-byte Folded Reload
+; CHECK-SD-NEXT: mov x0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: red_mla_dup_ext_u8_s8_s64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT: cbz w2, .LBB6_7
+; CHECK-GI-NEXT: // %bb.1: // %iter.check
+; CHECK-GI-NEXT: movi d0, #0000000000000000
+; CHECK-GI-NEXT: sxtb x9, w1
+; CHECK-GI-NEXT: mov x11, xzr
+; CHECK-GI-NEXT: cmp w2, #4
+; CHECK-GI-NEXT: mov w10, w2
+; CHECK-GI-NEXT: b.lo .LBB6_12
+; CHECK-GI-NEXT: // %bb.2: // %vector.main.loop.iter.check
+; CHECK-GI-NEXT: movi d0, #0000000000000000
+; CHECK-GI-NEXT: dup v1.2d, x9
+; CHECK-GI-NEXT: mov x11, xzr
+; CHECK-GI-NEXT: cmp w2, #16
+; CHECK-GI-NEXT: b.lo .LBB6_9
+; CHECK-GI-NEXT: // %bb.3: // %vector.ph
+; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
+; CHECK-GI-NEXT: xtn v2.2s, v1.2d
+; CHECK-GI-NEXT: and x8, x10, #0xc
+; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
+; CHECK-GI-NEXT: and x11, x10, #0xfffffff0
+; CHECK-GI-NEXT: movi v5.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v6.2d, #0000000000000000
+; CHECK-GI-NEXT: mov x12, x0
+; CHECK-GI-NEXT: movi v7.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v16.2d, #0000000000000000
+; CHECK-GI-NEXT: and x13, x10, #0xfffffff0
+; CHECK-GI-NEXT: movi v17.2d, #0000000000000000
+; CHECK-GI-NEXT: .LBB6_4: // %vector.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldr q18, [x12], #16
+; CHECK-GI-NEXT: subs x13, x13, #16
+; CHECK-GI-NEXT: ushll v19.8h, v18.8b, #0
+; CHECK-GI-NEXT: ushll2 v18.8h, v18.16b, #0
+; CHECK-GI-NEXT: ushll v20.4s, v19.4h, #0
+; CHECK-GI-NEXT: ushll2 v19.4s, v19.8h, #0
+; CHECK-GI-NEXT: ushll v21.4s, v18.4h, #0
+; CHECK-GI-NEXT: ushll2 v18.4s, v18.8h, #0
+; CHECK-GI-NEXT: mov d22, v20.d[1]
+; CHECK-GI-NEXT: mov d23, v19.d[1]
+; CHECK-GI-NEXT: mov d24, v21.d[1]
+; CHECK-GI-NEXT: mov d25, v18.d[1]
+; CHECK-GI-NEXT: smlal v0.2d, v2.2s, v20.2s
+; CHECK-GI-NEXT: smlal v4.2d, v2.2s, v19.2s
+; CHECK-GI-NEXT: smlal v6.2d, v2.2s, v21.2s
+; CHECK-GI-NEXT: smlal v16.2d, v2.2s, v18.2s
+; CHECK-GI-NEXT: smlal v3.2d, v2.2s, v22.2s
+; CHECK-GI-NEXT: smlal v5.2d, v2.2s, v23.2s
+; CHECK-GI-NEXT: smlal v7.2d, v2.2s, v24.2s
+; CHECK-GI-NEXT: smlal v17.2d, v2.2s, v25.2s
+; CHECK-GI-NEXT: b.ne .LBB6_4
+; CHECK-GI-NEXT: // %bb.5: // %middle.block
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT: add v2.2d, v4.2d, v5.2d
+; CHECK-GI-NEXT: cmp x11, x10
+; CHECK-GI-NEXT: add v3.2d, v6.2d, v7.2d
+; CHECK-GI-NEXT: add v4.2d, v16.2d, v17.2d
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: add v2.2d, v3.2d, v4.2d
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: addp d0, v0.2d
+; CHECK-GI-NEXT: b.ne .LBB6_8
+; CHECK-GI-NEXT: // %bb.6:
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: .LBB6_7:
+; CHECK-GI-NEXT: mov x8, xzr
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: .LBB6_8: // %vec.epilog.iter.check
+; CHECK-GI-NEXT: cbz x8, .LBB6_12
+; CHECK-GI-NEXT: .LBB6_9: // %vec.epilog.ph
+; CHECK-GI-NEXT: mov v0.d[1], xzr
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: mov x12, x11
+; CHECK-GI-NEXT: xtn v1.2s, v1.2d
+; CHECK-GI-NEXT: and x11, x10, #0xfffffffc
+; CHECK-GI-NEXT: sub x8, x12, x11
+; CHECK-GI-NEXT: add x12, x0, x12
+; CHECK-GI-NEXT: .LBB6_10: // %vec.epilog.vector.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldr w13, [x12], #4
+; CHECK-GI-NEXT: adds x8, x8, #4
+; CHECK-GI-NEXT: fmov s3, w13
+; CHECK-GI-NEXT: uxtb w13, w13
+; CHECK-GI-NEXT: mov b4, v3.b[2]
+; CHECK-GI-NEXT: mov b5, v3.b[1]
+; CHECK-GI-NEXT: mov b6, v3.b[3]
+; CHECK-GI-NEXT: fmov s3, w13
+; CHECK-GI-NEXT: fmov w14, s4
+; CHECK-GI-NEXT: fmov w15, s5
+; CHECK-GI-NEXT: fmov w16, s6
+; CHECK-GI-NEXT: uxtb w14, w14
+; CHECK-GI-NEXT: uxtb w15, w15
+; CHECK-GI-NEXT: uxtb w16, w16
+; CHECK-GI-NEXT: fmov s4, w14
+; CHECK-GI-NEXT: mov v3.s[1], w15
+; CHECK-GI-NEXT: mov v4.s[1], w16
+; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v3.2s
+; CHECK-GI-NEXT: smlal v2.2d, v1.2s, v4.2s
+; CHECK-GI-NEXT: b.ne .LBB6_10
+; CHECK-GI-NEXT: // %bb.11: // %vec.epilog.middle.block
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: cmp x11, x10
+; CHECK-GI-NEXT: addp d0, v0.2d
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: b.eq .LBB6_14
+; CHECK-GI-NEXT: .LBB6_12: // %for.body.preheader
+; CHECK-GI-NEXT: sub x10, x10, x11
+; CHECK-GI-NEXT: add x11, x0, x11
+; CHECK-GI-NEXT: .LBB6_13: // %for.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldrb w8, [x11], #1
+; CHECK-GI-NEXT: fmov x12, d0
+; CHECK-GI-NEXT: subs x10, x10, #1
+; CHECK-GI-NEXT: madd x8, x8, x9, x12
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: b.ne .LBB6_13
+; CHECK-GI-NEXT: .LBB6_14: // %for.cond.cleanup
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: ret
+entry:
+ %cmp5.not = icmp eq i32 %n, 0
+ br i1 %cmp5.not, label %for.cond.cleanup, label %iter.check
+
+iter.check: ; preds = %entry
+ %conv1 = sext i8 %B to i64
+ %wide.trip.count = zext i32 %n to i64
+ %min.iters.check = icmp ult i32 %n, 4
+ br i1 %min.iters.check, label %for.body.preheader, label %vector.main.loop.iter.check
+
+vector.main.loop.iter.check: ; preds = %iter.check
+ %min.iters.check9 = icmp ult i32 %n, 16
+ br i1 %min.iters.check9, label %vec.epilog.ph, label %vector.ph
+
+vector.ph: ; preds = %vector.main.loop.iter.check
+ %n.mod.vf = and i64 %wide.trip.count, 12
+ %n.vec = and i64 %wide.trip.count, 4294967280
+ %broadcast.splatinsert = insertelement <16 x i64> poison, i64 %conv1, i64 0
+ %broadcast.splat = shufflevector <16 x i64> %broadcast.splatinsert, <16 x i64> poison, <16 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.phi = phi <16 x i64> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
+ %0 = getelementptr inbounds nuw i8, ptr %A, i64 %index
+ %wide.load = load <16 x i8>, ptr %0, align 1
+ %1 = zext <16 x i8> %wide.load to <16 x i64>
+ %2 = mul nsw <16 x i64> %broadcast.splat, %1
+ %3 = add <16 x i64> %2, %vec.phi
+ %index.next = add nuw i64 %index, 16
+ %4 = icmp eq i64 %index.next, %n.vec
+ br i1 %4, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %5 = tail call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %3)
+ %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
+ br i1 %cmp.n, label %for.cond.cleanup, label %vec.epilog.iter.check
+
+vec.epilog.iter.check: ; preds = %middle.block
+ %min.epilog.iters.check = icmp eq i64 %n.mod.vf, 0
+ br i1 %min.epilog.iters.check, label %for.body.preheader, label %vec.epilog.ph
+
+vec.epilog.ph: ; preds = %vector.main.loop.iter.check, %vec.epilog.iter.check
+ %vec.epilog.resume.val = phi i64 [ %n.vec, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ]
+ %bc.merge.rdx = phi i64 [ %5, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ]
+ %n.vec11 = and i64 %wide.trip.count, 4294967292
+ %6 = insertelement <4 x i64> <i64 poison, i64 0, i64 0, i64 0>, i64 %bc.merge.rdx, i64 0
+ %broadcast.splatinsert12 = insertelement <4 x i64> poison, i64 %conv1, i64 0
+ %broadcast.splat13 = shufflevector <4 x i64> %broadcast.splatinsert12, <4 x i64> poison, <4 x i32> zeroinitializer
+ br label %vec.epilog.vector.body
+
+vec.epilog.vector.body: ; preds = %vec.epilog.vector.body, %vec.epilog.ph
+ %index14 = phi i64 [ %vec.epilog.resume.val, %vec.epilog.ph ], [ %index.next17, %vec.epilog.vector.body ]
+ %vec.phi15 = phi <4 x i64> [ %6, %vec.epilog.ph ], [ %10, %vec.epilog.vector.body ]
+ %7 = getelementptr inbounds nuw i8, ptr %A, i64 %index14
+ %wide.load16 = load <4 x i8>, ptr %7, align 1
+ %8 = zext <4 x i8> %wide.load16 to <4 x i64>
+ %9 = mul nsw <4 x i64> %broadcast.splat13, %8
+ %10 = add <4 x i64> %9, %vec.phi15
+ %index.next17 = add nuw i64 %index14, 4
+ %11 = icmp eq i64 %index.next17, %n.vec11
+ br i1 %11, label %vec.epilog.middle.block, label %vec.epilog.vector.body
+
+vec.epilog.middle.block: ; preds = %vec.epilog.vector.body
+ %12 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %10)
+ %cmp.n18 = icmp eq i64 %n.vec11, %wide.trip.count
+ br i1 %cmp.n18, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %iter.check, %vec.epilog.iter.check, %vec.epilog.middle.block
+ %indvars.iv.ph = phi i64 [ 0, %iter.check ], [ %n.vec, %vec.epilog.iter.check ], [ %n.vec11, %vec.epilog.middle.block ]
+ %s.06.ph = phi i64 [ 0, %iter.check ], [ %5, %vec.epilog.iter.check ], [ %12, %vec.epilog.middle.block ]
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %middle.block, %vec.epilog.middle.block, %entry
+ %s.0.lcssa = phi i64 [ 0, %entry ], [ %5, %middle.block ], [ %12, %vec.epilog.middle.block ], [ %add, %for.body ]
+ ret i64 %s.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader ]
+ %s.06 = phi i64 [ %add, %for.body ], [ %s.06.ph, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds nuw i8, ptr %A, i64 %indvars.iv
+ %13 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %13 to i64
+ %mul = mul nsw i64 %conv, %conv1
+ %add = add nsw i64 %mul, %s.06
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD-LABEL: sink_v2z64_1:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov x8, xzr
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: .LBB6_1: // %loop
+; CHECK-SD-NEXT: .LBB7_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr d1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -762,7 +1179,7 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD-NEXT: umull v1.2d, v1.2s, v0.s[1]
; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #15
; CHECK-SD-NEXT: str d1, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB6_1
+; CHECK-SD-NEXT: b.ne .LBB7_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -772,7 +1189,7 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.2d, v0.d[1]
; CHECK-GI-NEXT: xtn v0.2s, v0.2d
-; CHECK-GI-NEXT: .LBB6_1: // %loop
+; CHECK-GI-NEXT: .LBB7_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr d1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -780,7 +1197,7 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: umull v1.2d, v1.2s, v0.2s
; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #15
; CHECK-GI-NEXT: str d1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB6_1
+; CHECK-GI-NEXT: b.ne .LBB7_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -813,7 +1230,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov x8, xzr
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: .LBB7_1: // %loop
+; CHECK-SD-NEXT: .LBB8_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr q1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -823,7 +1240,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD-NEXT: shrn v2.2s, v2.2d, #15
; CHECK-SD-NEXT: shrn2 v2.4s, v1.2d, #15
; CHECK-SD-NEXT: str q2, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB7_1
+; CHECK-SD-NEXT: b.ne .LBB8_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -833,7 +1250,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.2d, v0.d[1]
; CHECK-GI-NEXT: xtn v0.2s, v0.2d
-; CHECK-GI-NEXT: .LBB7_1: // %loop
+; CHECK-GI-NEXT: .LBB8_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr q1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -844,7 +1261,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #15
; CHECK-GI-NEXT: shrn2 v1.4s, v2.2d, #15
; CHECK-GI-NEXT: str q1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB7_1
+; CHECK-GI-NEXT: b.ne .LBB8_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -877,7 +1294,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: dup v0.8b, v0.b[0]
; CHECK-SD-NEXT: mov x8, xzr
-; CHECK-SD-NEXT: .LBB8_1: // %loop
+; CHECK-SD-NEXT: .LBB9_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr d1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -886,7 +1303,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD-NEXT: cmlt v1.8h, v1.8h, #0
; CHECK-SD-NEXT: xtn v1.8b, v1.8h
; CHECK-SD-NEXT: str d1, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB8_1
+; CHECK-SD-NEXT: b.ne .LBB9_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -896,7 +1313,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.8h, v0.h[0]
; CHECK-GI-NEXT: xtn v0.8b, v0.8h
-; CHECK-GI-NEXT: .LBB8_1: // %loop
+; CHECK-GI-NEXT: .LBB9_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr d1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -905,7 +1322,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: cmlt v1.8h, v1.8h, #0
; CHECK-GI-NEXT: xtn v1.8b, v1.8h
; CHECK-GI-NEXT: str d1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB8_1
+; CHECK-GI-NEXT: b.ne .LBB9_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -938,7 +1355,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: dup v0.16b, v0.b[10]
; CHECK-SD-NEXT: mov x8, xzr
-; CHECK-SD-NEXT: .LBB9_1: // %loop
+; CHECK-SD-NEXT: .LBB10_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr q1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -949,7 +1366,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD-NEXT: cmlt v2.8h, v2.8h, #0
; CHECK-SD-NEXT: uzp1 v1.16b, v2.16b, v1.16b
; CHECK-SD-NEXT: str q1, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB9_1
+; CHECK-SD-NEXT: b.ne .LBB10_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -959,7 +1376,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.8h, v0.h[2]
; CHECK-GI-NEXT: xtn v0.8b, v0.8h
-; CHECK-GI-NEXT: .LBB9_1: // %loop
+; CHECK-GI-NEXT: .LBB10_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr q1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -971,7 +1388,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: cmlt v2.8h, v2.8h, #0
; CHECK-GI-NEXT: uzp1 v1.16b, v1.16b, v2.16b
; CHECK-GI-NEXT: str q1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB9_1
+; CHECK-GI-NEXT: b.ne .LBB10_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -1005,7 +1422,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-SD-NEXT: dup v0.4h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
-; CHECK-SD-NEXT: .LBB10_1: // %vector.body
+; CHECK-SD-NEXT: .LBB11_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #8
@@ -1015,7 +1432,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-SD-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT: stp q1, q2, [x9]
-; CHECK-SD-NEXT: b.ne .LBB10_1
+; CHECK-SD-NEXT: b.ne .LBB11_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1026,7 +1443,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff8
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: .LBB10_1: // %vector.body
+; CHECK-GI-NEXT: .LBB11_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #8
@@ -1036,7 +1453,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h
; CHECK-GI-NEXT: stp q1, q2, [x9]
-; CHECK-GI-NEXT: b.ne .LBB10_1
+; CHECK-GI-NEXT: b.ne .LBB11_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1089,7 +1506,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-SD-NEXT: dup v0.8h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff0
-; CHECK-SD-NEXT: .LBB11_1: // %vector.body
+; CHECK-SD-NEXT: .LBB12_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #16
@@ -1103,7 +1520,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT: stp q1, q3, [x9]
; CHECK-SD-NEXT: stp q2, q4, [x9, #32]
-; CHECK-SD-NEXT: b.ne .LBB11_1
+; CHECK-SD-NEXT: b.ne .LBB12_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1114,7 +1531,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff0
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: .LBB11_1: // %vector.body
+; CHECK-GI-NEXT: .LBB12_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #16
@@ -1130,7 +1547,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-GI-NEXT: umull v4.4s, v0.4h, v4.4h
; CHECK-GI-NEXT: stp q1, q3, [x9]
; CHECK-GI-NEXT: stp q2, q4, [x9, #32]!
-; CHECK-GI-NEXT: b.ne .LBB11_1
+; CHECK-GI-NEXT: b.ne .LBB12_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1184,7 +1601,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
; CHECK-SD-NEXT: fmov s0, w9
-; CHECK-SD-NEXT: .LBB12_1: // %vector.body
+; CHECK-SD-NEXT: .LBB13_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #8
@@ -1196,7 +1613,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-SD-NEXT: mul v1.4s, v1.4s, v0.s[0]
; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0]
; CHECK-SD-NEXT: stp q1, q2, [x9]
-; CHECK-SD-NEXT: b.ne .LBB12_1
+; CHECK-SD-NEXT: b.ne .LBB13_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1206,7 +1623,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-GI-NEXT: dup v0.4s, w8
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff8
-; CHECK-GI-NEXT: .LBB12_1: // %vector.body
+; CHECK-GI-NEXT: .LBB13_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #8
@@ -1218,7 +1635,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: stp q1, q2, [x9]
-; CHECK-GI-NEXT: b.ne .LBB12_1
+; CHECK-GI-NEXT: b.ne .LBB13_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1272,7 +1689,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff0
; CHECK-SD-NEXT: fmov s0, w9
-; CHECK-SD-NEXT: .LBB13_1: // %vector.body
+; CHECK-SD-NEXT: .LBB14_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #16
@@ -1290,7 +1707,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0]
; CHECK-SD-NEXT: stp q1, q3, [x9]
; CHECK-SD-NEXT: stp q2, q4, [x9, #32]
-; CHECK-SD-NEXT: b.ne .LBB13_1
+; CHECK-SD-NEXT: b.ne .LBB14_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1300,7 +1717,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-GI-NEXT: dup v0.4s, w8
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff0
-; CHECK-GI-NEXT: .LBB13_1: // %vector.body
+; CHECK-GI-NEXT: .LBB14_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #16
@@ -1318,7 +1735,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: stp q3, q1, [x9]
; CHECK-GI-NEXT: stp q4, q2, [x9, #32]!
-; CHECK-GI-NEXT: b.ne .LBB13_1
+; CHECK-GI-NEXT: b.ne .LBB14_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1369,9 +1786,9 @@ define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %sc
; CHECK-SD-LABEL: cmplx_mul_combined_re_im:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: lsr x9, x0, #16
-; CHECK-SD-NEXT: adrp x8, .LCPI14_0
+; CHECK-SD-NEXT: adrp x8, .LCPI15_0
; CHECK-SD-NEXT: dup v4.8h, w0
-; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI14_0]
+; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI15_0]
; CHECK-SD-NEXT: dup v2.8h, w9
; CHECK-SD-NEXT: sqneg v1.8h, v2.8h
; CHECK-SD-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b
@@ -1386,12 +1803,12 @@ define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %sc
; CHECK-GI-LABEL: cmplx_mul_combined_re_im:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: lsr w9, w0, #16
-; CHECK-GI-NEXT: adrp x8, .LCPI14_0
+; CHECK-GI-NEXT: adrp x8, .LCPI15_0
; CHECK-GI-NEXT: rev32 v4.8h, v0.8h
; CHECK-GI-NEXT: dup v1.8h, w9
; CHECK-GI-NEXT: fmov s3, w9
; CHECK-GI-NEXT: sqneg v2.8h, v1.8h
-; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
; CHECK-GI-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v1.16b
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: dup v3.8h, w0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 6e5c666..0cd885e 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -222,22 +222,20 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
-; CHECK-NEON-NEXT: ldrh w8, [x0]
-; CHECK-NEON-NEXT: ldrh w9, [x0, #2]
+; CHECK-NEON-NEXT: ldrh w8, [x0, #2]
+; CHECK-NEON-NEXT: ldr h0, [x0]
; CHECK-NEON-NEXT: ldr d1, [x1]
-; CHECK-NEON-NEXT: fmov d0, x8
-; CHECK-NEON-NEXT: mov v0.d[1], x9
+; CHECK-NEON-NEXT: mov v0.d[1], x8
; CHECK-NEON-NEXT: xtn v0.2s, v0.2d
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: ldrh w8, [x0]
-; CHECK-SVE-NEXT: ldrh w9, [x0, #2]
+; CHECK-SVE-NEXT: ldrh w8, [x0, #2]
+; CHECK-SVE-NEXT: ldr h0, [x0]
; CHECK-SVE-NEXT: ldr d1, [x1]
-; CHECK-SVE-NEXT: fmov d0, x8
-; CHECK-SVE-NEXT: mov v0.d[1], x9
+; CHECK-SVE-NEXT: mov v0.d[1], x8
; CHECK-SVE-NEXT: xtn v0.2s, v0.2d
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll b/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll
new file mode 100644
index 0000000..cf52934
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll
@@ -0,0 +1,178 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; Test optimization of DUP with extended narrow loads
+; This should avoid GPR->SIMD transfers by loading directly into vector registers
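+; Without the combine the scalar load goes through a GPR and then crosses to the
+; vector unit (e.g. "ldrb w8, [x0]" followed by "dup v0.4h, w8"); with it the
+; narrow value is loaded directly into a SIMD register ("ldr b0, [x0]") and
+; duplicated from lane 0, as the checks below expect.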
+
+define <4 x i16> @test_dup_zextload_i8_v4i16(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.4h, v0.h[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i16
+ %vec = insertelement <4 x i16> poison, i16 %ext, i32 0
+ %dup = shufflevector <4 x i16> %vec, <4 x i16> poison, <4 x i32> zeroinitializer
+ ret <4 x i16> %dup
+}
+
+define <8 x i16> @test_dup_zextload_i8_v8i16(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i16
+ %vec = insertelement <8 x i16> poison, i16 %ext, i32 0
+ %dup = shufflevector <8 x i16> %vec, <8 x i16> poison, <8 x i32> zeroinitializer
+ ret <8 x i16> %dup
+}
+
+define <2 x i32> @test_dup_zextload_i8_v2i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <2 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
+ ret <2 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i8_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i8_v4i32_offset(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0, #4]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i8, ptr %p, i64 4
+ %load = load i8, ptr %addr, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i8_v4i32_reg_offset(ptr %p, i64 %offset) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0, x1]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i8, ptr %p, i64 %offset
+ %load = load i8, ptr %addr, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <2 x i64> @test_dup_zextload_i8_v2i64(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2d, v0.d[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i64
+ %vec = insertelement <2 x i64> poison, i64 %ext, i32 0
+ %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %dup
+}
+
+define <2 x i32> @test_dup_zextload_i16_v2i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <2 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
+ ret <2 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32_offset(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0, #8]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i16, ptr %p, i64 4
+ %load = load i16, ptr %addr, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32_reg_offset(ptr %p, i64 %offset) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32_reg_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i16, ptr %p, i64 %offset
+ %load = load i16, ptr %addr, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <2 x i64> @test_dup_zextload_i16_v2i64(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.2d, v0.d[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 1
+ %ext = zext i16 %load to i64
+ %vec = insertelement <2 x i64> poison, i64 %ext, i32 0
+ %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %dup
+}
+
+define <2 x i64> @test_dup_zextload_i32_v2i64(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i32_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: dup v0.2d, v0.d[0]
+; CHECK-NEXT: ret
+ %load = load i32, ptr %p, align 1
+ %ext = zext i32 %load to i64
+ %vec = insertelement <2 x i64> poison, i64 %ext, i32 0
+ %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %dup
+}
diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll
index 079ff10..670574f2 100644
--- a/llvm/test/CodeGen/AArch64/dup.ll
+++ b/llvm/test/CodeGen/AArch64/dup.ll
@@ -32,8 +32,8 @@ entry:
define <2 x i8> @loaddup_v2i8(ptr %p) {
; CHECK-LABEL: loaddup_v2i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: dup v0.2s, w8
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
; CHECK-NEXT: ret
entry:
%a = load i8, ptr %p
@@ -189,8 +189,8 @@ entry:
define <4 x i8> @loaddup_v4i8(ptr %p) {
; CHECK-SD-LABEL: loaddup_v4i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: dup v0.4h, w8
+; CHECK-SD-NEXT: ldr b0, [x0]
+; CHECK-SD-NEXT: dup v0.4h, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: loaddup_v4i8:
@@ -444,8 +444,8 @@ entry:
define <2 x i16> @loaddup_v2i16(ptr %p) {
; CHECK-SD-LABEL: loaddup_v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: dup v0.2s, w8
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: dup v0.2s, v0.s[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: loaddup_v2i16:
diff --git a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
index 6177ae5..628506b 100644
--- a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
@@ -84,8 +84,7 @@ entry:
define double @load_u64_from_u32_off1(ptr %n){
; CHECK-LABEL: load_u64_from_u32_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldur w8, [x0, #1]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur s0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -98,8 +97,7 @@ entry:
define double @load_u64_from_u16_off1(ptr %n){
; CHECK-LABEL: load_u64_from_u16_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #1]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -125,8 +123,7 @@ entry:
define float @load_u32_from_u16_off1(ptr %n){
; CHECK-LABEL: load_u32_from_u16_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #1]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -168,8 +165,7 @@ entry:
define double @load_u64_from_u32_off2(ptr %n){
; CHECK-LABEL: load_u64_from_u32_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldur w8, [x0, #2]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur s0, [x0, #2]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 2
@@ -250,8 +246,7 @@ entry:
define double @load_u64_from_u32_off255(ptr %n){
; CHECK-LABEL: load_u64_from_u32_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldur w8, [x0, #255]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur s0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -264,8 +259,7 @@ entry:
define double @load_u64_from_u16_off255(ptr %n){
; CHECK-LABEL: load_u64_from_u16_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #255]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur h0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -291,8 +285,7 @@ entry:
define float @load_u32_from_u16_off255(ptr %n){
; CHECK-LABEL: load_u32_from_u16_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #255]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldur h0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -494,8 +487,8 @@ entry:
define double @load_u64_from_u32_offnp1(ptr %n){
; CHECK-LABEL: load_u64_from_u32_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #4, lsl #12 // =16384
-; CHECK-NEXT: ldr s0, [x8]
+; CHECK-NEXT: mov w8, #16384 // =0x4000
+; CHECK-NEXT: ldr s0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 16384
@@ -508,8 +501,8 @@ entry:
define double @load_u64_from_u16_offnp1(ptr %n){
; CHECK-LABEL: load_u64_from_u16_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #2, lsl #12 // =8192
-; CHECK-NEXT: ldr h0, [x8]
+; CHECK-NEXT: mov w8, #8192 // =0x2000
+; CHECK-NEXT: ldr h0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 8192
@@ -522,8 +515,8 @@ entry:
define double @load_u64_from_u8_offnp1(ptr %n){
; CHECK-LABEL: load_u64_from_u8_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #1, lsl #12 // =4096
-; CHECK-NEXT: ldr b0, [x8]
+; CHECK-NEXT: mov w8, #4096 // =0x1000
+; CHECK-NEXT: ldr b0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 4096
@@ -536,8 +529,8 @@ entry:
define float @load_u32_from_u16_offnp1(ptr %n){
; CHECK-LABEL: load_u32_from_u16_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #2, lsl #12 // =8192
-; CHECK-NEXT: ldr h0, [x8]
+; CHECK-NEXT: mov w8, #8192 // =0x2000
+; CHECK-NEXT: ldr h0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 8192
@@ -550,8 +543,8 @@ entry:
define float @load_u32_from_u8_offnp1(ptr %n){
; CHECK-LABEL: load_u32_from_u8_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #1, lsl #12 // =4096
-; CHECK-NEXT: ldr b0, [x8]
+; CHECK-NEXT: mov w8, #4096 // =0x1000
+; CHECK-NEXT: ldr b0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 4096
@@ -564,8 +557,8 @@ entry:
define half @load_u16_from_u8_offnp1(ptr %n){
; CHECK-LABEL: load_u16_from_u8_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #1, lsl #12 // =4096
-; CHECK-NEXT: ldr b0, [x8]
+; CHECK-NEXT: mov w8, #4096 // =0x1000
+; CHECK-NEXT: ldr b0, [x0, x8]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
index b8d6c88..3f35cb5 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
@@ -829,7 +829,7 @@ define void @try_catch_agnostic_za_invoke() "aarch64_za_state_agnostic" personal
; CHECK-SDAG-NEXT: bl __arm_sme_restore
; CHECK-SDAG-NEXT: b .LBB5_1
entry:
- invoke void @agnostic_za_call()
+ invoke void @agnostic_za_call() "aarch64_za_state_agnostic"
to label %exit unwind label %catch
catch:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 353c09b..ecd7cc2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1778,7 +1778,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -1790,7 +1790,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1802,7 +1802,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX9-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1815,7 +1815,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 1, v2
-; GFX10PLUS-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v3
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%result = ashr i65 %value, 33
ret i65 %result
@@ -1875,21 +1875,19 @@ define amdgpu_ps i65 @s_ashr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_ashr_i65_33:
; GCN: ; %bb.0:
; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GCN-NEXT: s_lshr_b32 s0, s1, 1
-; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GCN-NEXT: s_lshr_b32 s4, s1, 1
+; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
+; GCN-NEXT: s_or_b32 s0, s0, s4
; GCN-NEXT: s_ashr_i32 s2, s3, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_ashr_i65_33:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
-; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
+; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s4
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = ashr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir
new file mode 100644
index 0000000..48e9818
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir
@@ -0,0 +1,97 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck %s
+
+---
+name: test_combine_or_s64_s32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_combine_or_s64_s32
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV]], [[COPY1]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s32) = COPY $sgpr2
+ %2:_(s64) = G_ZEXT %1(s32)
+ %3:_(s64) = G_OR %0, %2
+ $sgpr0_sgpr1 = COPY %3(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_combine_or_s64_s32_rhs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_combine_or_s64_s32_rhs
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV]], [[COPY1]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s32) = COPY $sgpr2
+ %2:_(s64) = G_ZEXT %1(s32)
+ %3:_(s64) = G_OR %2, %0
+ $sgpr0_sgpr1 = COPY %3(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_combine_or_s64_s32_merge_unmerge
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_combine_or_s64_s32_merge_unmerge
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY]], [[COPY2]]
+ ; CHECK-NEXT: $sgpr0 = COPY [[OR]](s32)
+ ; CHECK-NEXT: $sgpr1 = COPY [[COPY1]](s32)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = COPY $sgpr1
+ %2:_(s32) = COPY $sgpr2
+ %3:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
+ %4:_(s64) = G_ZEXT %2(s32)
+ %5:_(s64) = G_OR %3, %4
+ %6:_(s32), %7:_(s32) = G_UNMERGE_VALUES %5(s64)
+ $sgpr0 = COPY %6(s32)
+ $sgpr1 = COPY %7(s32)
+ SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+...
+---
+name: negative_test_incorrect_types
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5
+ ; CHECK-LABEL: name: negative_test_incorrect_types
+ ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s128) = G_ZEXT [[COPY1]](s64)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s128) = G_OR [[COPY]], [[ZEXT]]
+ ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[OR]](s128)
+ %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ %1:_(s64) = COPY $vgpr4_vgpr5
+ %2:_(s128) = G_ZEXT %1
+ %3:_(s128) = G_OR %0, %2
+ $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index 5dff8c1..667fa98 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -227,39 +227,38 @@ exit:
define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 x i32> inreg %.WorkgroupId, <3 x i32> %.LocalInvocationId) #0 {
; GFX10-LABEL: single_lane_execution_attribute:
; GFX10: ; %bb.0: ; %.entry
-; GFX10-NEXT: s_getpc_b64 s[12:13]
-; GFX10-NEXT: s_mov_b32 s12, 0
+; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_mov_b32 s2, s0
-; GFX10-NEXT: s_mov_b32 s3, s12
+; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, -1, 0
-; GFX10-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3]
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: s_xor_b32 s2, vcc_lo, exec_lo
-; GFX10-NEXT: s_and_b32 vcc_lo, s2, exec_lo
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen
+; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2
; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader
-; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: s_mov_b32 s3, 0
; GFX10-NEXT: .LBB4_2: ; %.preheader
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_mov_b32_e32 v3, s12
+; GFX10-NEXT: v_mov_b32_e32 v3, s2
; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1
-; GFX10-NEXT: s_add_i32 s12, s12, 4
+; GFX10-NEXT: s_add_i32 s2, s2, 4
; GFX10-NEXT: buffer_load_dword v3, v3, s[4:7], 0 offen
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: s_add_i32 s2, s3, s2
+; GFX10-NEXT: v_readfirstlane_b32 s12, v3
+; GFX10-NEXT: s_add_i32 s3, s12, s3
; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s2, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2
; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX10-NEXT: s_branch .LBB4_6
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index bd53032..715a777 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -4934,17 +4934,15 @@ define amdgpu_ps i64 @s_fshl_i64_5(i64 inreg %lhs, i64 inreg %rhs) {
; GCN: ; %bb.0:
; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
; GCN-NEXT: s_lshr_b32 s2, s3, 27
-; GCN-NEXT: s_mov_b32 s3, 0
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_or_b32 s0, s0, s2
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i64_5:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
; GFX11-NEXT: s_lshr_b32 s2, s3, 27
-; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_or_b32 s0, s0, s2
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 5)
ret i64 %result
@@ -4954,20 +4952,13 @@ define amdgpu_ps i64 @s_fshl_i64_32(i64 inreg %lhs, i64 inreg %rhs) {
; GCN-LABEL: s_fshl_i64_32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s1, s0
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_mov_b32 s2, s3
-; GCN-NEXT: s_mov_b32 s3, s0
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_mov_b32 s0, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i64_32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s1, s0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_mov_b32 s2, s3
-; GFX11-NEXT: s_mov_b32 s3, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_mov_b32 s0, s3
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 32)
ret i64 %result
@@ -6823,56 +6814,50 @@ define amdgpu_ps i128 @s_fshl_i128_65(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
; GFX6-NEXT: s_lshr_b32 s4, s5, 31
-; GFX6-NEXT: s_mov_b32 s5, 0
; GFX6-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
-; GFX6-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
+; GFX6-NEXT: s_or_b32 s0, s0, s4
; GFX6-NEXT: s_lshr_b32 s4, s7, 31
-; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT: s_or_b32 s2, s2, s4
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshl_i128_65:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
; GFX8-NEXT: s_lshr_b32 s4, s5, 31
-; GFX8-NEXT: s_mov_b32 s5, 0
; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
-; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
+; GFX8-NEXT: s_or_b32 s0, s0, s4
; GFX8-NEXT: s_lshr_b32 s4, s7, 31
-; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT: s_or_b32 s2, s2, s4
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshl_i128_65:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
; GFX9-NEXT: s_lshr_b32 s4, s5, 31
-; GFX9-NEXT: s_mov_b32 s5, 0
; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
-; GFX9-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
+; GFX9-NEXT: s_or_b32 s0, s0, s4
; GFX9-NEXT: s_lshr_b32 s4, s7, 31
-; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT: s_or_b32 s2, s2, s4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshl_i128_65:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshr_b32 s2, s5, 31
-; GFX10-NEXT: s_mov_b32 s3, 0
-; GFX10-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
-; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX10-NEXT: s_lshr_b32 s2, s7, 31
-; GFX10-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
+; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
+; GFX10-NEXT: s_lshr_b32 s4, s5, 31
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
+; GFX10-NEXT: s_lshr_b32 s5, s7, 31
+; GFX10-NEXT: s_or_b32 s0, s0, s4
+; GFX10-NEXT: s_or_b32 s2, s2, s5
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i128_65:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s2, s5, 31
-; GFX11-NEXT: s_mov_b32 s3, 0
-; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
-; GFX11-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX11-NEXT: s_lshr_b32 s2, s7, 31
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
+; GFX11-NEXT: s_lshr_b32 s4, s5, 31
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
+; GFX11-NEXT: s_lshr_b32 s5, s7, 31
+; GFX11-NEXT: s_or_b32 s0, s0, s4
+; GFX11-NEXT: s_or_b32 s2, s2, s5
; GFX11-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65)
ret i128 %result
@@ -6885,7 +6870,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX6-NEXT: v_lshl_b64 v[2:3], v[0:1], 1
; GFX6-NEXT: v_lshl_b64 v[0:1], v[6:7], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v7
; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -6896,7 +6881,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7]
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v7
; GFX8-NEXT: v_or_b32_e32 v2, v2, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -6907,7 +6892,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7]
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX9-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v7
; GFX9-NEXT: v_or_b32_e32 v2, v2, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -6919,7 +6904,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7]
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 31, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 31, v7
-; GFX10-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v4
; GFX10-NEXT: v_or_b32_e32 v2, v2, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -6931,7 +6916,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 31, v5
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 31, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v4
; GFX11-NEXT: v_or_b32_e32 v2, v2, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index ea6b3a3..5aa5a671 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -4715,20 +4715,13 @@ define amdgpu_ps i64 @s_fshr_i64_32(i64 inreg %lhs, i64 inreg %rhs) {
; GCN-LABEL: s_fshr_i64_32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s1, s0
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_mov_b32 s2, s3
-; GCN-NEXT: s_mov_b32 s3, s0
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_mov_b32 s0, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_i64_32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s1, s0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_mov_b32 s2, s3
-; GFX11-NEXT: s_mov_b32 s3, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_mov_b32 s0, s3
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 32)
ret i64 %result
@@ -4739,17 +4732,15 @@ define amdgpu_ps i64 @s_fshr_i64_48(i64 inreg %lhs, i64 inreg %rhs) {
; GCN: ; %bb.0:
; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GCN-NEXT: s_lshr_b32 s2, s3, 16
-; GCN-NEXT: s_mov_b32 s3, 0
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_or_b32 s0, s0, s2
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_i64_48:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX11-NEXT: s_lshr_b32 s2, s3, 16
-; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_or_b32 s0, s0, s2
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 48)
ret i64 %result
@@ -5293,34 +5284,33 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
-; GFX6-NEXT: s_mov_b32 s1, 0
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s8
+; GFX6-NEXT: s_or_b32 s2, s2, s0
+; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s8
; GFX6-NEXT: s_not_b32 s9, s8
-; GFX6-NEXT: s_sub_i32 s16, s2, 64
-; GFX6-NEXT: s_sub_i32 s12, 64, s2
-; GFX6-NEXT: s_cmp_lt_u32 s2, 64
+; GFX6-NEXT: s_sub_i32 s16, s0, 64
+; GFX6-NEXT: s_sub_i32 s12, 64, s0
+; GFX6-NEXT: s_cmp_lt_u32 s0, 64
; GFX6-NEXT: s_cselect_b32 s17, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s2, 0
+; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: s_cselect_b32 s18, 1, 0
; GFX6-NEXT: s_lshr_b64 s[12:13], s[10:11], s12
-; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s9
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
+; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s9
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[10:11], s9
; GFX6-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
; GFX6-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
; GFX6-NEXT: s_cmp_lg_u32 s17, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11]
-; GFX6-NEXT: s_and_b32 s0, s8, 0x7f
-; GFX6-NEXT: s_sub_i32 s14, s0, 64
-; GFX6-NEXT: s_sub_i32 s12, 64, s0
-; GFX6-NEXT: s_cmp_lt_u32 s0, 64
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
+; GFX6-NEXT: s_and_b32 s9, s8, 0x7f
+; GFX6-NEXT: s_sub_i32 s14, s9, 64
+; GFX6-NEXT: s_sub_i32 s12, 64, s9
+; GFX6-NEXT: s_cmp_lt_u32 s9, 64
; GFX6-NEXT: s_cselect_b32 s15, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s0, 0
+; GFX6-NEXT: s_cmp_eq_u32 s9, 0
; GFX6-NEXT: s_cselect_b32 s16, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s8
+; GFX6-NEXT: s_lshr_b64 s[10:11], s[6:7], s8
; GFX6-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
; GFX6-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
@@ -5330,9 +5320,9 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX6-NEXT: s_cmp_lg_u32 s16, 0
; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_cmp_lg_u32 s15, 0
-; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX6-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[10:11], 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshr_i128:
@@ -5340,34 +5330,33 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
-; GFX8-NEXT: s_mov_b32 s1, 0
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s8
+; GFX8-NEXT: s_or_b32 s2, s2, s0
+; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s8
; GFX8-NEXT: s_not_b32 s9, s8
-; GFX8-NEXT: s_sub_i32 s16, s2, 64
-; GFX8-NEXT: s_sub_i32 s12, 64, s2
-; GFX8-NEXT: s_cmp_lt_u32 s2, 64
+; GFX8-NEXT: s_sub_i32 s16, s0, 64
+; GFX8-NEXT: s_sub_i32 s12, 64, s0
+; GFX8-NEXT: s_cmp_lt_u32 s0, 64
; GFX8-NEXT: s_cselect_b32 s17, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s2, 0
+; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: s_lshr_b64 s[12:13], s[10:11], s12
-; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], s9
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
+; GFX8-NEXT: s_lshl_b64 s[14:15], s[2:3], s9
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[10:11], s9
; GFX8-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
; GFX8-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
; GFX8-NEXT: s_cmp_lg_u32 s17, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX8-NEXT: s_cmp_lg_u32 s18, 0
-; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11]
-; GFX8-NEXT: s_and_b32 s0, s8, 0x7f
-; GFX8-NEXT: s_sub_i32 s14, s0, 64
-; GFX8-NEXT: s_sub_i32 s12, 64, s0
-; GFX8-NEXT: s_cmp_lt_u32 s0, 64
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
+; GFX8-NEXT: s_and_b32 s9, s8, 0x7f
+; GFX8-NEXT: s_sub_i32 s14, s9, 64
+; GFX8-NEXT: s_sub_i32 s12, 64, s9
+; GFX8-NEXT: s_cmp_lt_u32 s9, 64
; GFX8-NEXT: s_cselect_b32 s15, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s0, 0
+; GFX8-NEXT: s_cmp_eq_u32 s9, 0
; GFX8-NEXT: s_cselect_b32 s16, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s8
+; GFX8-NEXT: s_lshr_b64 s[10:11], s[6:7], s8
; GFX8-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
; GFX8-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
@@ -5377,9 +5366,9 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX8-NEXT: s_cmp_lg_u32 s16, 0
; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX8-NEXT: s_cmp_lg_u32 s15, 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX8-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[10:11], 0
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshr_i128:
@@ -5387,34 +5376,33 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
-; GFX9-NEXT: s_mov_b32 s1, 0
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s8
+; GFX9-NEXT: s_or_b32 s2, s2, s0
+; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s8
; GFX9-NEXT: s_not_b32 s9, s8
-; GFX9-NEXT: s_sub_i32 s16, s2, 64
-; GFX9-NEXT: s_sub_i32 s12, 64, s2
-; GFX9-NEXT: s_cmp_lt_u32 s2, 64
+; GFX9-NEXT: s_sub_i32 s16, s0, 64
+; GFX9-NEXT: s_sub_i32 s12, 64, s0
+; GFX9-NEXT: s_cmp_lt_u32 s0, 64
; GFX9-NEXT: s_cselect_b32 s17, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cselect_b32 s18, 1, 0
; GFX9-NEXT: s_lshr_b64 s[12:13], s[10:11], s12
-; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s9
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
+; GFX9-NEXT: s_lshl_b64 s[14:15], s[2:3], s9
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], s9
; GFX9-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
; GFX9-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
; GFX9-NEXT: s_cmp_lg_u32 s17, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX9-NEXT: s_cmp_lg_u32 s18, 0
-; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11]
-; GFX9-NEXT: s_and_b32 s0, s8, 0x7f
-; GFX9-NEXT: s_sub_i32 s14, s0, 64
-; GFX9-NEXT: s_sub_i32 s12, 64, s0
-; GFX9-NEXT: s_cmp_lt_u32 s0, 64
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
+; GFX9-NEXT: s_and_b32 s9, s8, 0x7f
+; GFX9-NEXT: s_sub_i32 s14, s9, 64
+; GFX9-NEXT: s_sub_i32 s12, 64, s9
+; GFX9-NEXT: s_cmp_lt_u32 s9, 64
; GFX9-NEXT: s_cselect_b32 s15, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s0, 0
+; GFX9-NEXT: s_cmp_eq_u32 s9, 0
; GFX9-NEXT: s_cselect_b32 s16, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s8
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[6:7], s8
; GFX9-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
; GFX9-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
@@ -5424,19 +5412,18 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: s_cmp_lg_u32 s15, 0
-; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX9-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[10:11], 0
+; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshr_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s10, s1, 31
-; GFX10-NEXT: s_mov_b32 s11, 0
-; GFX10-NEXT: s_andn2_b32 s9, 0x7f, s8
+; GFX10-NEXT: s_lshr_b32 s9, s1, 31
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
+; GFX10-NEXT: s_or_b32 s2, s2, s9
+; GFX10-NEXT: s_andn2_b32 s9, 0x7f, s8
; GFX10-NEXT: s_not_b32 s14, s8
; GFX10-NEXT: s_sub_i32 s16, s9, 64
; GFX10-NEXT: s_sub_i32 s10, 64, s9
@@ -5479,11 +5466,10 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX11-LABEL: s_fshr_i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_lshr_b32 s10, s1, 31
-; GFX11-NEXT: s_mov_b32 s11, 0
-; GFX11-NEXT: s_and_not1_b32 s9, 0x7f, s8
+; GFX11-NEXT: s_lshr_b32 s9, s1, 31
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
+; GFX11-NEXT: s_or_b32 s2, s2, s9
+; GFX11-NEXT: s_and_not1_b32 s9, 0x7f, s8
; GFX11-NEXT: s_not_b32 s14, s8
; GFX11-NEXT: s_sub_i32 s16, s9, 64
; GFX11-NEXT: s_sub_i32 s10, 64, s9
@@ -5786,13 +5772,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
-; GFX6-NEXT: s_mov_b32 s1, 0
; GFX6-NEXT: v_bfi_b32 v7, v0, 0, v1
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX6-NEXT: s_or_b32 s2, s2, s0
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7
; GFX6-NEXT: v_not_b32_e32 v8, 63
; GFX6-NEXT: v_lshr_b64 v[1:2], s[8:9], v1
-; GFX6-NEXT: v_lshl_b64 v[3:4], s[0:1], v7
+; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v7
; GFX6-NEXT: v_add_i32_e32 v9, vcc, v7, v8
; GFX6-NEXT: v_lshl_b64 v[5:6], s[8:9], v7
; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
@@ -5803,8 +5788,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT: v_mov_b32_e32 v3, s0
-; GFX6-NEXT: v_mov_b32_e32 v4, s1
+; GFX6-NEXT: v_mov_b32_e32 v3, s2
+; GFX6-NEXT: v_mov_b32_e32 v4, s3
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX6-NEXT: v_and_b32_e32 v11, 0x7f, v0
; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
@@ -5839,13 +5824,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
-; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: v_bfi_b32 v7, v0, 0, v1
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_or_b32 s2, s2, s0
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7
; GFX8-NEXT: v_not_b32_e32 v8, 63
; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9]
-; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3]
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v7, v8
; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9]
; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
@@ -5856,8 +5840,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s3
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX8-NEXT: v_and_b32_e32 v11, 0x7f, v0
; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
@@ -5892,12 +5876,11 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
-; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: v_bfi_b32 v7, v0, 0, v1
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX9-NEXT: s_or_b32 s2, s2, s0
; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7
; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9]
-; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3]
; GFX9-NEXT: v_add_u32_e32 v8, 0xffffffc0, v7
; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9]
; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
@@ -5908,10 +5891,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
@@ -5941,34 +5924,33 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX10-LABEL: v_fshr_i128_ssv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v11, v0, 0, 0x7f
-; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s8, s1, 31
-; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], 1
+; GFX10-NEXT: s_lshr_b32 s2, s1, 31
; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v11
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v11
+; GFX10-NEXT: s_or_b32 s8, s8, s2
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xffffffc0, v11
; GFX10-NEXT: v_lshlrev_b64 v[3:4], v11, s[8:9]
-; GFX10-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v12
+; GFX10-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11
; GFX10-NEXT: v_add_nc_u32_e32 v13, 0xffffffc0, v12
; GFX10-NEXT: v_lshrrev_b64 v[7:8], v12, s[4:5]
+; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7]
; GFX10-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1]
; GFX10-NEXT: v_or_b32_e32 v3, v1, v3
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7]
; GFX10-NEXT: v_or_b32_e32 v4, v2, v4
; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v12
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v11
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v14, v0, v3, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7]
; GFX10-NEXT: v_or_b32_e32 v0, v7, v9
; GFX10-NEXT: v_or_b32_e32 v7, v8, v10
; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1
; GFX10-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7]
@@ -5988,18 +5970,18 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX11-LABEL: v_fshr_i128_ssv:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_bfi_b32 v11, v0, 0, 0x7f
-; GFX11-NEXT: s_lshr_b32 s8, s1, 31
+; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], 1
+; GFX11-NEXT: s_lshr_b32 s2, s1, 31
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_mov_b32 s9, 0
+; GFX11-NEXT: s_or_b32 s8, s8, s2
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 64, v11
; GFX11-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1]
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11
; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0
-; GFX11-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
-; GFX11-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
; GFX11-NEXT: v_lshlrev_b64 v[3:4], v11, s[8:9]
+; GFX11-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
; GFX11-NEXT: v_dual_cndmask_b32 v5, 0, v5 :: v_dual_add_nc_u32 v0, 0xffffffc0, v11
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v12
; GFX11-NEXT: v_lshrrev_b64 v[7:8], v12, s[4:5]
; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v12
@@ -6045,26 +6027,25 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
-; GFX6-NEXT: s_mov_b32 s1, 0
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s4
+; GFX6-NEXT: s_or_b32 s2, s2, s0
+; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s4
; GFX6-NEXT: s_not_b32 s5, s4
-; GFX6-NEXT: s_sub_i32 s12, s2, 64
-; GFX6-NEXT: s_sub_i32 s8, 64, s2
-; GFX6-NEXT: s_cmp_lt_u32 s2, 64
+; GFX6-NEXT: s_sub_i32 s12, s0, 64
+; GFX6-NEXT: s_sub_i32 s8, 64, s0
+; GFX6-NEXT: s_cmp_lt_u32 s0, 64
; GFX6-NEXT: s_cselect_b32 s13, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s2, 0
+; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: s_cselect_b32 s14, 1, 0
; GFX6-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
-; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], s5
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
+; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s5
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[6:7], s5
; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
; GFX6-NEXT: s_cmp_lg_u32 s13, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[6:7]
; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX6-NEXT: s_and_b32 s0, s4, 0x7f
; GFX6-NEXT: s_sub_i32 s1, s0, 64
; GFX6-NEXT: s_sub_i32 s4, 64, s0
@@ -6073,14 +6054,14 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s0
; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4
-; GFX6-NEXT: s_cselect_b32 s8, 1, 0
+; GFX6-NEXT: s_cselect_b32 s6, 1, 0
; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s0
; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s1
; GFX6-NEXT: s_and_b32 s0, 1, s5
; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_or_b32_e32 v5, v5, v7
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX6-NEXT: s_and_b32 s0, 1, s8
+; GFX6-NEXT: s_and_b32 s0, 1, s6
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6088,10 +6069,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX6-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX6-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX6-NEXT: v_or_b32_e32 v2, s6, v2
-; GFX6-NEXT: v_or_b32_e32 v3, s7, v3
+; GFX6-NEXT: v_or_b32_e32 v0, s10, v0
+; GFX6-NEXT: v_or_b32_e32 v1, s11, v1
+; GFX6-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX6-NEXT: v_or_b32_e32 v3, s3, v3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshr_i128_svs:
@@ -6099,26 +6080,25 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
-; GFX8-NEXT: s_mov_b32 s1, 0
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s4
+; GFX8-NEXT: s_or_b32 s2, s2, s0
+; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s4
; GFX8-NEXT: s_not_b32 s5, s4
-; GFX8-NEXT: s_sub_i32 s12, s2, 64
-; GFX8-NEXT: s_sub_i32 s8, 64, s2
-; GFX8-NEXT: s_cmp_lt_u32 s2, 64
+; GFX8-NEXT: s_sub_i32 s12, s0, 64
+; GFX8-NEXT: s_sub_i32 s8, 64, s0
+; GFX8-NEXT: s_cmp_lt_u32 s0, 64
; GFX8-NEXT: s_cselect_b32 s13, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s2, 0
+; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s14, 1, 0
; GFX8-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
-; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], s5
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
+; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s5
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s5
; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
; GFX8-NEXT: s_cmp_lg_u32 s13, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[6:7]
; GFX8-NEXT: s_cmp_lg_u32 s14, 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX8-NEXT: s_and_b32 s0, s4, 0x7f
; GFX8-NEXT: s_sub_i32 s1, s0, 64
; GFX8-NEXT: s_sub_i32 s4, 64, s0
@@ -6127,14 +6107,14 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX8-NEXT: s_cselect_b32 s8, 1, 0
+; GFX8-NEXT: s_cselect_b32 s6, 1, 0
; GFX8-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
; GFX8-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3]
; GFX8-NEXT: s_and_b32 s0, 1, s5
; GFX8-NEXT: v_or_b32_e32 v4, v4, v6
; GFX8-NEXT: v_or_b32_e32 v5, v5, v7
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT: s_and_b32 s0, 1, s8
+; GFX8-NEXT: s_and_b32 s0, 1, s6
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6142,10 +6122,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX8-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX8-NEXT: v_or_b32_e32 v2, s6, v2
-; GFX8-NEXT: v_or_b32_e32 v3, s7, v3
+; GFX8-NEXT: v_or_b32_e32 v0, s10, v0
+; GFX8-NEXT: v_or_b32_e32 v1, s11, v1
+; GFX8-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX8-NEXT: v_or_b32_e32 v3, s3, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshr_i128_svs:
@@ -6153,26 +6133,25 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
-; GFX9-NEXT: s_mov_b32 s1, 0
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s4
+; GFX9-NEXT: s_or_b32 s2, s2, s0
+; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s4
; GFX9-NEXT: s_not_b32 s5, s4
-; GFX9-NEXT: s_sub_i32 s12, s2, 64
-; GFX9-NEXT: s_sub_i32 s8, 64, s2
-; GFX9-NEXT: s_cmp_lt_u32 s2, 64
+; GFX9-NEXT: s_sub_i32 s12, s0, 64
+; GFX9-NEXT: s_sub_i32 s8, 64, s0
+; GFX9-NEXT: s_cmp_lt_u32 s0, 64
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cselect_b32 s14, 1, 0
; GFX9-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
-; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s5
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], s5
; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
; GFX9-NEXT: s_cmp_lg_u32 s13, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[6:7]
; GFX9-NEXT: s_cmp_lg_u32 s14, 0
-; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX9-NEXT: s_and_b32 s0, s4, 0x7f
; GFX9-NEXT: s_sub_i32 s1, s0, 64
; GFX9-NEXT: s_sub_i32 s4, 64, s0
@@ -6181,14 +6160,14 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX9-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-NEXT: s_cselect_b32 s6, 1, 0
; GFX9-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
; GFX9-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3]
; GFX9-NEXT: s_and_b32 s0, 1, s5
; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
; GFX9-NEXT: v_or_b32_e32 v5, v5, v7
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT: s_and_b32 s0, 1, s8
+; GFX9-NEXT: s_and_b32 s0, 1, s6
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6196,20 +6175,19 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX9-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX9-NEXT: v_or_b32_e32 v2, s6, v2
-; GFX9-NEXT: v_or_b32_e32 v3, s7, v3
+; GFX9-NEXT: v_or_b32_e32 v0, s10, v0
+; GFX9-NEXT: v_or_b32_e32 v1, s11, v1
+; GFX9-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX9-NEXT: v_or_b32_e32 v3, s3, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshr_i128_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s6, s1, 31
-; GFX10-NEXT: s_mov_b32 s7, 0
-; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4
+; GFX10-NEXT: s_lshr_b32 s5, s1, 31
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX10-NEXT: s_or_b32 s2, s2, s5
+; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4
; GFX10-NEXT: s_not_b32 s10, s4
; GFX10-NEXT: s_sub_i32 s12, s5, 64
; GFX10-NEXT: s_sub_i32 s6, 64, s5
@@ -6259,11 +6237,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX11-LABEL: v_fshr_i128_svs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_lshr_b32 s6, s1, 31
-; GFX11-NEXT: s_mov_b32 s7, 0
-; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4
+; GFX11-NEXT: s_lshr_b32 s5, s1, 31
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX11-NEXT: s_or_b32 s2, s2, s5
+; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4
; GFX11-NEXT: s_not_b32 s10, s4
; GFX11-NEXT: s_sub_i32 s12, s5, 64
; GFX11-NEXT: s_sub_i32 s6, 64, s5
@@ -6714,81 +6691,80 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) {
; GFX6-LABEL: s_fshr_v2i128:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX6-NEXT: s_lshr_b32 s22, s1, 31
-; GFX6-NEXT: s_mov_b32 s23, 0
; GFX6-NEXT: s_lshl_b64 s[18:19], s[0:1], 1
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23]
-; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s16
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX6-NEXT: s_lshr_b32 s0, s1, 31
+; GFX6-NEXT: s_or_b32 s2, s2, s0
+; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s16
; GFX6-NEXT: s_not_b32 s17, s16
-; GFX6-NEXT: s_sub_i32 s21, s2, 64
-; GFX6-NEXT: s_sub_i32 s22, 64, s2
-; GFX6-NEXT: s_cmp_lt_u32 s2, 64
-; GFX6-NEXT: s_cselect_b32 s28, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s2, 0
-; GFX6-NEXT: s_cselect_b32 s29, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[24:25], s[18:19], s22
-; GFX6-NEXT: s_lshl_b64 s[26:27], s[0:1], s17
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[18:19], s17
-; GFX6-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX6-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
-; GFX6-NEXT: s_cmp_lg_u32 s28, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19]
-; GFX6-NEXT: s_cmp_lg_u32 s29, 0
-; GFX6-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19]
-; GFX6-NEXT: s_and_b32 s0, s16, 0x7f
; GFX6-NEXT: s_sub_i32 s21, s0, 64
; GFX6-NEXT: s_sub_i32 s22, 64, s0
; GFX6-NEXT: s_cmp_lt_u32 s0, 64
; GFX6-NEXT: s_cselect_b32 s26, 1, 0
; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: s_cselect_b32 s27, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s16
+; GFX6-NEXT: s_lshr_b64 s[22:23], s[18:19], s22
+; GFX6-NEXT: s_lshl_b64 s[24:25], s[2:3], s17
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[18:19], s17
+; GFX6-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX6-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
+; GFX6-NEXT: s_cmp_lg_u32 s26, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX6-NEXT: s_cselect_b64 s[18:19], s[22:23], s[18:19]
+; GFX6-NEXT: s_cmp_lg_u32 s27, 0
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19]
+; GFX6-NEXT: s_and_b32 s17, s16, 0x7f
+; GFX6-NEXT: s_sub_i32 s21, s17, 64
+; GFX6-NEXT: s_sub_i32 s22, 64, s17
+; GFX6-NEXT: s_cmp_lt_u32 s17, 64
+; GFX6-NEXT: s_cselect_b32 s24, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s17, 0
+; GFX6-NEXT: s_cselect_b32 s25, 1, 0
+; GFX6-NEXT: s_lshr_b64 s[18:19], s[10:11], s16
; GFX6-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
-; GFX6-NEXT: s_lshl_b64 s[24:25], s[10:11], s22
-; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX6-NEXT: s_lshl_b64 s[22:23], s[10:11], s22
+; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23]
; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
-; GFX6-NEXT: s_cmp_lg_u32 s26, 0
+; GFX6-NEXT: s_cmp_lg_u32 s24, 0
; GFX6-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11]
-; GFX6-NEXT: s_cmp_lg_u32 s27, 0
+; GFX6-NEXT: s_cmp_lg_u32 s25, 0
; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX6-NEXT: s_cmp_lg_u32 s26, 0
-; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
-; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GFX6-NEXT: s_lshr_b32 s22, s5, 31
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX6-NEXT: s_cmp_lg_u32 s24, 0
+; GFX6-NEXT: s_cselect_b64 s[10:11], s[18:19], 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], 1
-; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23]
-; GFX6-NEXT: s_andn2_b32 s6, 0x7f, s20
-; GFX6-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11]
+; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
+; GFX6-NEXT: s_lshr_b32 s4, s5, 31
+; GFX6-NEXT: s_or_b32 s6, s6, s4
+; GFX6-NEXT: s_andn2_b32 s4, 0x7f, s20
+; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
; GFX6-NEXT: s_not_b32 s16, s20
-; GFX6-NEXT: s_sub_i32 s18, s6, 64
-; GFX6-NEXT: s_sub_i32 s10, 64, s6
-; GFX6-NEXT: s_cmp_lt_u32 s6, 64
+; GFX6-NEXT: s_sub_i32 s18, s4, 64
+; GFX6-NEXT: s_sub_i32 s10, 64, s4
+; GFX6-NEXT: s_cmp_lt_u32 s4, 64
; GFX6-NEXT: s_cselect_b32 s19, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s6, 0
+; GFX6-NEXT: s_cmp_eq_u32 s4, 0
; GFX6-NEXT: s_cselect_b32 s21, 1, 0
-; GFX6-NEXT: s_lshl_b64 s[6:7], s[8:9], s16
+; GFX6-NEXT: s_lshl_b64 s[4:5], s[8:9], s16
; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s10
-; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
+; GFX6-NEXT: s_lshl_b64 s[16:17], s[6:7], s16
; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX6-NEXT: s_lshl_b64 s[8:9], s[8:9], s18
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
+; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX6-NEXT: s_cmp_lg_u32 s21, 0
-; GFX6-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9]
-; GFX6-NEXT: s_and_b32 s4, s20, 0x7f
-; GFX6-NEXT: s_sub_i32 s18, s4, 64
-; GFX6-NEXT: s_sub_i32 s16, 64, s4
-; GFX6-NEXT: s_cmp_lt_u32 s4, 64
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[8:9]
+; GFX6-NEXT: s_and_b32 s8, s20, 0x7f
+; GFX6-NEXT: s_sub_i32 s18, s8, 64
+; GFX6-NEXT: s_sub_i32 s16, 64, s8
+; GFX6-NEXT: s_cmp_lt_u32 s8, 64
; GFX6-NEXT: s_cselect_b32 s19, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s4, 0
+; GFX6-NEXT: s_cmp_eq_u32 s8, 0
; GFX6-NEXT: s_cselect_b32 s21, 1, 0
; GFX6-NEXT: s_lshr_b64 s[10:11], s[12:13], s20
; GFX6-NEXT: s_lshl_b64 s[16:17], s[14:15], s16
-; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s20
+; GFX6-NEXT: s_lshr_b64 s[8:9], s[14:15], s20
; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX6-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
@@ -6796,88 +6772,87 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX6-NEXT: s_cmp_lg_u32 s21, 0
; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b64 s[12:13], s[4:5], 0
-; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11]
-; GFX6-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13]
+; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshr_v2i128:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX8-NEXT: s_lshr_b32 s22, s1, 31
-; GFX8-NEXT: s_mov_b32 s23, 0
; GFX8-NEXT: s_lshl_b64 s[18:19], s[0:1], 1
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23]
-; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s16
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX8-NEXT: s_lshr_b32 s0, s1, 31
+; GFX8-NEXT: s_or_b32 s2, s2, s0
+; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s16
; GFX8-NEXT: s_not_b32 s17, s16
-; GFX8-NEXT: s_sub_i32 s21, s2, 64
-; GFX8-NEXT: s_sub_i32 s22, 64, s2
-; GFX8-NEXT: s_cmp_lt_u32 s2, 64
-; GFX8-NEXT: s_cselect_b32 s28, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s2, 0
-; GFX8-NEXT: s_cselect_b32 s29, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[24:25], s[18:19], s22
-; GFX8-NEXT: s_lshl_b64 s[26:27], s[0:1], s17
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[18:19], s17
-; GFX8-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX8-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
-; GFX8-NEXT: s_cmp_lg_u32 s28, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19]
-; GFX8-NEXT: s_cmp_lg_u32 s29, 0
-; GFX8-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19]
-; GFX8-NEXT: s_and_b32 s0, s16, 0x7f
; GFX8-NEXT: s_sub_i32 s21, s0, 64
; GFX8-NEXT: s_sub_i32 s22, 64, s0
; GFX8-NEXT: s_cmp_lt_u32 s0, 64
; GFX8-NEXT: s_cselect_b32 s26, 1, 0
; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s27, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s16
+; GFX8-NEXT: s_lshr_b64 s[22:23], s[18:19], s22
+; GFX8-NEXT: s_lshl_b64 s[24:25], s[2:3], s17
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[18:19], s17
+; GFX8-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX8-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
+; GFX8-NEXT: s_cmp_lg_u32 s26, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX8-NEXT: s_cselect_b64 s[18:19], s[22:23], s[18:19]
+; GFX8-NEXT: s_cmp_lg_u32 s27, 0
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19]
+; GFX8-NEXT: s_and_b32 s17, s16, 0x7f
+; GFX8-NEXT: s_sub_i32 s21, s17, 64
+; GFX8-NEXT: s_sub_i32 s22, 64, s17
+; GFX8-NEXT: s_cmp_lt_u32 s17, 64
+; GFX8-NEXT: s_cselect_b32 s24, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 s17, 0
+; GFX8-NEXT: s_cselect_b32 s25, 1, 0
+; GFX8-NEXT: s_lshr_b64 s[18:19], s[10:11], s16
; GFX8-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
-; GFX8-NEXT: s_lshl_b64 s[24:25], s[10:11], s22
-; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX8-NEXT: s_lshl_b64 s[22:23], s[10:11], s22
+; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23]
; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
-; GFX8-NEXT: s_cmp_lg_u32 s26, 0
+; GFX8-NEXT: s_cmp_lg_u32 s24, 0
; GFX8-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11]
-; GFX8-NEXT: s_cmp_lg_u32 s27, 0
+; GFX8-NEXT: s_cmp_lg_u32 s25, 0
; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX8-NEXT: s_cmp_lg_u32 s26, 0
-; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
-; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GFX8-NEXT: s_lshr_b32 s22, s5, 31
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX8-NEXT: s_cmp_lg_u32 s24, 0
+; GFX8-NEXT: s_cselect_b64 s[10:11], s[18:19], 0
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX8-NEXT: s_lshl_b64 s[8:9], s[4:5], 1
-; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23]
-; GFX8-NEXT: s_andn2_b32 s6, 0x7f, s20
-; GFX8-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11]
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
+; GFX8-NEXT: s_lshr_b32 s4, s5, 31
+; GFX8-NEXT: s_or_b32 s6, s6, s4
+; GFX8-NEXT: s_andn2_b32 s4, 0x7f, s20
+; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
; GFX8-NEXT: s_not_b32 s16, s20
-; GFX8-NEXT: s_sub_i32 s18, s6, 64
-; GFX8-NEXT: s_sub_i32 s10, 64, s6
-; GFX8-NEXT: s_cmp_lt_u32 s6, 64
+; GFX8-NEXT: s_sub_i32 s18, s4, 64
+; GFX8-NEXT: s_sub_i32 s10, 64, s4
+; GFX8-NEXT: s_cmp_lt_u32 s4, 64
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s6, 0
+; GFX8-NEXT: s_cmp_eq_u32 s4, 0
; GFX8-NEXT: s_cselect_b32 s21, 1, 0
-; GFX8-NEXT: s_lshl_b64 s[6:7], s[8:9], s16
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[8:9], s16
; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s10
-; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
+; GFX8-NEXT: s_lshl_b64 s[16:17], s[6:7], s16
; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX8-NEXT: s_lshl_b64 s[8:9], s[8:9], s18
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
+; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX8-NEXT: s_cmp_lg_u32 s21, 0
-; GFX8-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9]
-; GFX8-NEXT: s_and_b32 s4, s20, 0x7f
-; GFX8-NEXT: s_sub_i32 s18, s4, 64
-; GFX8-NEXT: s_sub_i32 s16, 64, s4
-; GFX8-NEXT: s_cmp_lt_u32 s4, 64
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[8:9]
+; GFX8-NEXT: s_and_b32 s8, s20, 0x7f
+; GFX8-NEXT: s_sub_i32 s18, s8, 64
+; GFX8-NEXT: s_sub_i32 s16, 64, s8
+; GFX8-NEXT: s_cmp_lt_u32 s8, 64
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s4, 0
+; GFX8-NEXT: s_cmp_eq_u32 s8, 0
; GFX8-NEXT: s_cselect_b32 s21, 1, 0
; GFX8-NEXT: s_lshr_b64 s[10:11], s[12:13], s20
; GFX8-NEXT: s_lshl_b64 s[16:17], s[14:15], s16
-; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s20
+; GFX8-NEXT: s_lshr_b64 s[8:9], s[14:15], s20
; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
@@ -6885,88 +6860,87 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX8-NEXT: s_cmp_lg_u32 s21, 0
; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
-; GFX8-NEXT: s_cselect_b64 s[12:13], s[4:5], 0
-; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11]
-; GFX8-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13]
+; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshr_v2i128:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX9-NEXT: s_lshr_b32 s22, s1, 31
-; GFX9-NEXT: s_mov_b32 s23, 0
; GFX9-NEXT: s_lshl_b64 s[18:19], s[0:1], 1
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23]
-; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s16
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX9-NEXT: s_lshr_b32 s0, s1, 31
+; GFX9-NEXT: s_or_b32 s2, s2, s0
+; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s16
; GFX9-NEXT: s_not_b32 s17, s16
-; GFX9-NEXT: s_sub_i32 s21, s2, 64
-; GFX9-NEXT: s_sub_i32 s22, 64, s2
-; GFX9-NEXT: s_cmp_lt_u32 s2, 64
-; GFX9-NEXT: s_cselect_b32 s28, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s29, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[24:25], s[18:19], s22
-; GFX9-NEXT: s_lshl_b64 s[26:27], s[0:1], s17
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[18:19], s17
-; GFX9-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX9-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
-; GFX9-NEXT: s_cmp_lg_u32 s28, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19]
-; GFX9-NEXT: s_cmp_lg_u32 s29, 0
-; GFX9-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19]
-; GFX9-NEXT: s_and_b32 s0, s16, 0x7f
; GFX9-NEXT: s_sub_i32 s21, s0, 64
; GFX9-NEXT: s_sub_i32 s22, 64, s0
; GFX9-NEXT: s_cmp_lt_u32 s0, 64
; GFX9-NEXT: s_cselect_b32 s26, 1, 0
; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cselect_b32 s27, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s16
+; GFX9-NEXT: s_lshr_b64 s[22:23], s[18:19], s22
+; GFX9-NEXT: s_lshl_b64 s[24:25], s[2:3], s17
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[18:19], s17
+; GFX9-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX9-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
+; GFX9-NEXT: s_cmp_lg_u32 s26, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX9-NEXT: s_cselect_b64 s[18:19], s[22:23], s[18:19]
+; GFX9-NEXT: s_cmp_lg_u32 s27, 0
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19]
+; GFX9-NEXT: s_and_b32 s17, s16, 0x7f
+; GFX9-NEXT: s_sub_i32 s21, s17, 64
+; GFX9-NEXT: s_sub_i32 s22, 64, s17
+; GFX9-NEXT: s_cmp_lt_u32 s17, 64
+; GFX9-NEXT: s_cselect_b32 s24, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s17, 0
+; GFX9-NEXT: s_cselect_b32 s25, 1, 0
+; GFX9-NEXT: s_lshr_b64 s[18:19], s[10:11], s16
; GFX9-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
-; GFX9-NEXT: s_lshl_b64 s[24:25], s[10:11], s22
-; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX9-NEXT: s_lshl_b64 s[22:23], s[10:11], s22
+; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23]
; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
-; GFX9-NEXT: s_cmp_lg_u32 s26, 0
+; GFX9-NEXT: s_cmp_lg_u32 s24, 0
; GFX9-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11]
-; GFX9-NEXT: s_cmp_lg_u32 s27, 0
+; GFX9-NEXT: s_cmp_lg_u32 s25, 0
; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX9-NEXT: s_cmp_lg_u32 s26, 0
-; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GFX9-NEXT: s_lshr_b32 s22, s5, 31
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX9-NEXT: s_cmp_lg_u32 s24, 0
+; GFX9-NEXT: s_cselect_b64 s[10:11], s[18:19], 0
+; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX9-NEXT: s_lshl_b64 s[8:9], s[4:5], 1
-; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23]
-; GFX9-NEXT: s_andn2_b32 s6, 0x7f, s20
-; GFX9-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
+; GFX9-NEXT: s_lshr_b32 s4, s5, 31
+; GFX9-NEXT: s_or_b32 s6, s6, s4
+; GFX9-NEXT: s_andn2_b32 s4, 0x7f, s20
+; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
; GFX9-NEXT: s_not_b32 s16, s20
-; GFX9-NEXT: s_sub_i32 s18, s6, 64
-; GFX9-NEXT: s_sub_i32 s10, 64, s6
-; GFX9-NEXT: s_cmp_lt_u32 s6, 64
+; GFX9-NEXT: s_sub_i32 s18, s4, 64
+; GFX9-NEXT: s_sub_i32 s10, 64, s4
+; GFX9-NEXT: s_cmp_lt_u32 s4, 64
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s6, 0
+; GFX9-NEXT: s_cmp_eq_u32 s4, 0
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[8:9], s16
+; GFX9-NEXT: s_lshl_b64 s[4:5], s[8:9], s16
; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s10
-; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
+; GFX9-NEXT: s_lshl_b64 s[16:17], s[6:7], s16
; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX9-NEXT: s_lshl_b64 s[8:9], s[8:9], s18
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
-; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
+; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX9-NEXT: s_cmp_lg_u32 s21, 0
-; GFX9-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9]
-; GFX9-NEXT: s_and_b32 s4, s20, 0x7f
-; GFX9-NEXT: s_sub_i32 s18, s4, 64
-; GFX9-NEXT: s_sub_i32 s16, 64, s4
-; GFX9-NEXT: s_cmp_lt_u32 s4, 64
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[8:9]
+; GFX9-NEXT: s_and_b32 s8, s20, 0x7f
+; GFX9-NEXT: s_sub_i32 s18, s8, 64
+; GFX9-NEXT: s_sub_i32 s16, 64, s8
+; GFX9-NEXT: s_cmp_lt_u32 s8, 64
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
+; GFX9-NEXT: s_cmp_eq_u32 s8, 0
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_lshr_b64 s[10:11], s[12:13], s20
; GFX9-NEXT: s_lshl_b64 s[16:17], s[14:15], s16
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s20
+; GFX9-NEXT: s_lshr_b64 s[8:9], s[14:15], s20
; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX9-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
@@ -6974,61 +6948,60 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX9-NEXT: s_cmp_lg_u32 s21, 0
; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
-; GFX9-NEXT: s_cselect_b64 s[12:13], s[4:5], 0
-; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11]
-; GFX9-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13]
+; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshr_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s18, s1, 31
-; GFX10-NEXT: s_mov_b32 s19, 0
-; GFX10-NEXT: s_andn2_b32 s17, 0x7f, s16
+; GFX10-NEXT: s_lshr_b32 s17, s1, 31
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19]
-; GFX10-NEXT: s_not_b32 s18, s16
-; GFX10-NEXT: s_sub_i32 s21, s17, 64
-; GFX10-NEXT: s_sub_i32 s22, 64, s17
+; GFX10-NEXT: s_or_b32 s2, s2, s17
+; GFX10-NEXT: s_andn2_b32 s17, 0x7f, s16
+; GFX10-NEXT: s_not_b32 s21, s16
+; GFX10-NEXT: s_sub_i32 s26, s17, 64
+; GFX10-NEXT: s_sub_i32 s18, 64, s17
; GFX10-NEXT: s_cmp_lt_u32 s17, 64
-; GFX10-NEXT: s_cselect_b32 s28, 1, 0
+; GFX10-NEXT: s_cselect_b32 s27, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s17, 0
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[22:23], s[0:1], s22
-; GFX10-NEXT: s_lshl_b64 s[24:25], s[2:3], s18
-; GFX10-NEXT: s_lshl_b64 s[26:27], s[0:1], s18
-; GFX10-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
-; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s21
-; GFX10-NEXT: s_cmp_lg_u32 s28, 0
-; GFX10-NEXT: s_cselect_b64 s[24:25], s[26:27], 0
-; GFX10-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX10-NEXT: s_lshr_b64 s[18:19], s[0:1], s18
+; GFX10-NEXT: s_lshl_b64 s[22:23], s[2:3], s21
+; GFX10-NEXT: s_lshl_b64 s[24:25], s[0:1], s21
+; GFX10-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s26
+; GFX10-NEXT: s_cmp_lg_u32 s27, 0
+; GFX10-NEXT: s_cselect_b64 s[22:23], s[24:25], 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1]
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX10-NEXT: s_and_b32 s0, s16, 0x7f
-; GFX10-NEXT: s_sub_i32 s18, s0, 64
+; GFX10-NEXT: s_sub_i32 s21, s0, 64
; GFX10-NEXT: s_sub_i32 s17, 64, s0
; GFX10-NEXT: s_cmp_lt_u32 s0, 64
-; GFX10-NEXT: s_cselect_b32 s21, 1, 0
+; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s0, 0
-; GFX10-NEXT: s_cselect_b32 s26, 1, 0
+; GFX10-NEXT: s_cselect_b32 s25, 1, 0
; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s16
-; GFX10-NEXT: s_lshl_b64 s[22:23], s[10:11], s17
+; GFX10-NEXT: s_lshl_b64 s[18:19], s[10:11], s17
; GFX10-NEXT: s_lshr_b64 s[16:17], s[10:11], s16
-; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23]
-; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s18
-; GFX10-NEXT: s_cmp_lg_u32 s21, 0
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
+; GFX10-NEXT: s_cmp_lg_u32 s24, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX10-NEXT: s_cmp_lg_u32 s26, 0
+; GFX10-NEXT: s_cmp_lg_u32 s25, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX10-NEXT: s_cmp_lg_u32 s21, 0
+; GFX10-NEXT: s_cmp_lg_u32 s24, 0
; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0
; GFX10-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX10-NEXT: s_lshr_b32 s18, s5, 31
+; GFX10-NEXT: s_lshr_b32 s8, s5, 31
+; GFX10-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1]
+; GFX10-NEXT: s_or_b32 s6, s6, s8
; GFX10-NEXT: s_andn2_b32 s8, 0x7f, s20
-; GFX10-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1]
; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], 1
-; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19]
; GFX10-NEXT: s_not_b32 s16, s20
; GFX10-NEXT: s_sub_i32 s18, s8, 64
; GFX10-NEXT: s_sub_i32 s9, 64, s8
@@ -7071,54 +7044,53 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX11-LABEL: s_fshr_v2i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_lshr_b32 s18, s1, 31
-; GFX11-NEXT: s_mov_b32 s19, 0
-; GFX11-NEXT: s_and_not1_b32 s17, 0x7f, s16
+; GFX11-NEXT: s_lshr_b32 s17, s1, 31
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19]
-; GFX11-NEXT: s_not_b32 s18, s16
-; GFX11-NEXT: s_sub_i32 s21, s17, 64
-; GFX11-NEXT: s_sub_i32 s22, 64, s17
+; GFX11-NEXT: s_or_b32 s2, s2, s17
+; GFX11-NEXT: s_and_not1_b32 s17, 0x7f, s16
+; GFX11-NEXT: s_not_b32 s21, s16
+; GFX11-NEXT: s_sub_i32 s26, s17, 64
+; GFX11-NEXT: s_sub_i32 s18, 64, s17
; GFX11-NEXT: s_cmp_lt_u32 s17, 64
-; GFX11-NEXT: s_cselect_b32 s28, 1, 0
+; GFX11-NEXT: s_cselect_b32 s27, 1, 0
; GFX11-NEXT: s_cmp_eq_u32 s17, 0
; GFX11-NEXT: s_cselect_b32 s17, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[22:23], s[0:1], s22
-; GFX11-NEXT: s_lshl_b64 s[24:25], s[2:3], s18
-; GFX11-NEXT: s_lshl_b64 s[26:27], s[0:1], s18
-; GFX11-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s21
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_cselect_b64 s[24:25], s[26:27], 0
-; GFX11-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX11-NEXT: s_lshr_b64 s[18:19], s[0:1], s18
+; GFX11-NEXT: s_lshl_b64 s[22:23], s[2:3], s21
+; GFX11-NEXT: s_lshl_b64 s[24:25], s[0:1], s21
+; GFX11-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23]
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s26
+; GFX11-NEXT: s_cmp_lg_u32 s27, 0
+; GFX11-NEXT: s_cselect_b64 s[22:23], s[24:25], 0
+; GFX11-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1]
; GFX11-NEXT: s_cmp_lg_u32 s17, 0
; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX11-NEXT: s_and_b32 s0, s16, 0x7f
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_sub_i32 s18, s0, 64
+; GFX11-NEXT: s_sub_i32 s21, s0, 64
; GFX11-NEXT: s_sub_i32 s17, 64, s0
; GFX11-NEXT: s_cmp_lt_u32 s0, 64
-; GFX11-NEXT: s_cselect_b32 s21, 1, 0
+; GFX11-NEXT: s_cselect_b32 s24, 1, 0
; GFX11-NEXT: s_cmp_eq_u32 s0, 0
-; GFX11-NEXT: s_cselect_b32 s26, 1, 0
+; GFX11-NEXT: s_cselect_b32 s25, 1, 0
; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s16
-; GFX11-NEXT: s_lshl_b64 s[22:23], s[10:11], s17
+; GFX11-NEXT: s_lshl_b64 s[18:19], s[10:11], s17
; GFX11-NEXT: s_lshr_b64 s[16:17], s[10:11], s16
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23]
-; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s18
-; GFX11-NEXT: s_cmp_lg_u32 s21, 0
+; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
+; GFX11-NEXT: s_cmp_lg_u32 s24, 0
; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX11-NEXT: s_cmp_lg_u32 s26, 0
+; GFX11-NEXT: s_cmp_lg_u32 s25, 0
; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX11-NEXT: s_cmp_lg_u32 s21, 0
+; GFX11-NEXT: s_cmp_lg_u32 s24, 0
; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0
; GFX11-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX11-NEXT: s_lshr_b32 s18, s5, 31
+; GFX11-NEXT: s_lshr_b32 s8, s5, 31
+; GFX11-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1]
+; GFX11-NEXT: s_or_b32 s6, s6, s8
; GFX11-NEXT: s_and_not1_b32 s8, 0x7f, s20
-; GFX11-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1]
; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], 1
-; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19]
; GFX11-NEXT: s_not_b32 s16, s20
; GFX11-NEXT: s_sub_i32 s18, s8, 64
; GFX11-NEXT: s_sub_i32 s9, 64, s8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir
new file mode 100644
index 0000000..4b214e6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir
@@ -0,0 +1,275 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX12 %s
+
+---
+name: test_fmaximum_f16
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s16) = G_FMAXNUM_IEEE [[TRUNC]], [[TRUNC1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[SELECT]](s16)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY2]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_f16
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s16) = G_FMAXIMUM [[TRUNC]], [[TRUNC1]]
+ ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMAXIMUM]](s16)
+ ; GFX12-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s16) = G_TRUNC %0(s32)
+ %2:_(s32) = COPY $vgpr1
+ %3:_(s16) = G_TRUNC %2(s32)
+ %4:_(s16) = G_FMAXIMUM %1, %3
+ %5:_(s32) = G_ANYEXT %4(s16)
+ $vgpr0 = COPY %5(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_f64
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; GFX9-LABEL: name: test_fmaximum_f64
+ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s64) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s64), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[SELECT]](s64)
+ ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_f64
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s64) = G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FMAXIMUM]](s64)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(s64) = G_FMAXIMUM %0, %1
+ $vgpr0_vgpr1 = COPY %2(s64)
+ SI_RETURN implicit $vgpr0_vgpr1
+...
+---
+name: test_fmaximum_v2f16
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_v2f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC2]]
+ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC1]](s16), [[TRUNC3]]
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+ ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[FMAXNUM_IEEE]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[TRUNC4]], [[C1]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[TRUNC5]], [[C1]]
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT]](s16), [[SELECT1]](s16)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY [[BUILD_VECTOR]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](<2 x s16>)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_v2f16
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<2 x s16>) = G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](<2 x s16>)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(<2 x s16>) = COPY $vgpr0
+ %1:_(<2 x s16>) = COPY $vgpr1
+ %2:_(<2 x s16>) = G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(<2 x s16>)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_v2f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; GFX9-LABEL: name: test_fmaximum_v2f32
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY2]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY2]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY1]], [[COPY3]]
+ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY1]](s32), [[COPY3]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[FMAXNUM_IEEE1]], [[C]]
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_v2f32
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY]], [[COPY2]]
+ ; GFX12-NEXT: [[FMAXIMUM1:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY1]], [[COPY3]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[FMAXIMUM1]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(<2 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(<2 x s32>) = G_BUILD_VECTOR %3(s32), %4(s32)
+ %6:_(<2 x s32>) = G_FMAXIMUM %2, %5
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(<2 x s32>)
+ $vgpr0 = COPY %7(s32)
+ $vgpr1 = COPY %8(s32)
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+---
+name: test_fmaximum_nsz_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_nsz_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_nsz_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = nsz G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = nsz G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_nnan_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_nnan_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[FMAXNUM_IEEE]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_nnan_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = nnan G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = nnan G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminimum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminimum.mir
new file mode 100644
index 0000000..8ba0794
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminimum.mir
@@ -0,0 +1,275 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX12 %s
+
+---
+name: test_fminimum_f16
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s16) = G_FMINNUM_IEEE [[TRUNC]], [[TRUNC1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[SELECT]](s16)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY2]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_f16
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s16) = G_FMINIMUM [[TRUNC]], [[TRUNC1]]
+ ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMINIMUM]](s16)
+ ; GFX12-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s16) = G_TRUNC %0(s32)
+ %2:_(s32) = COPY $vgpr1
+ %3:_(s16) = G_TRUNC %2(s32)
+ %4:_(s16) = G_FMINIMUM %1, %3
+ %5:_(s32) = G_ANYEXT %4(s16)
+ $vgpr0 = COPY %5(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fminimum_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s32) = G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = G_FMINIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fminimum_f64
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; GFX9-LABEL: name: test_fminimum_f64
+ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s64) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s64), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[SELECT]](s64)
+ ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+ ;
+ ; GFX12-LABEL: name: test_fminimum_f64
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s64) = G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FMINIMUM]](s64)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(s64) = G_FMINIMUM %0, %1
+ $vgpr0_vgpr1 = COPY %2(s64)
+ SI_RETURN implicit $vgpr0_vgpr1
+...
+---
+name: test_fminimum_v2f16
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_v2f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC2]]
+ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC1]](s16), [[TRUNC3]]
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+ ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[FMINNUM_IEEE]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[TRUNC4]], [[C1]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[TRUNC5]], [[C1]]
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT]](s16), [[SELECT1]](s16)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY [[BUILD_VECTOR]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](<2 x s16>)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_v2f16
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(<2 x s16>) = G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](<2 x s16>)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(<2 x s16>) = COPY $vgpr0
+ %1:_(<2 x s16>) = COPY $vgpr1
+ %2:_(<2 x s16>) = G_FMINIMUM %0, %1
+ $vgpr0 = COPY %2(<2 x s16>)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fminimum_v2f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; GFX9-LABEL: name: test_fminimum_v2f32
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY]], [[COPY2]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY2]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY1]], [[COPY3]]
+ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY1]](s32), [[COPY3]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[FMINNUM_IEEE1]], [[C]]
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX12-LABEL: name: test_fminimum_v2f32
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s32) = G_FMINIMUM [[COPY]], [[COPY2]]
+ ; GFX12-NEXT: [[FMINIMUM1:%[0-9]+]]:_(s32) = G_FMINIMUM [[COPY1]], [[COPY3]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[FMINIMUM1]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(<2 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(<2 x s32>) = G_BUILD_VECTOR %3(s32), %4(s32)
+ %6:_(<2 x s32>) = G_FMINIMUM %2, %5
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(<2 x s32>)
+ $vgpr0 = COPY %7(s32)
+ $vgpr1 = COPY %8(s32)
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+---
+name: test_fminimum_nsz_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_nsz_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_nsz_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s32) = nsz G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = nsz G_FMINIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fminimum_nnan_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_nnan_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[FMINNUM_IEEE]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_nnan_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s32) = nnan G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = nnan G_FMINIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 8533e34..518af70 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1750,7 +1750,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX6-NEXT: v_and_b32_e32 v0, 1, v2
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3
-; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -1763,7 +1763,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1776,7 +1776,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3
-; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1789,7 +1789,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX10-NEXT: v_and_b32_e32 v0, 1, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
-; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1800,7 +1800,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 1, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
-; GFX11-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = lshr i65 %value, 33
@@ -1859,21 +1859,19 @@ define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_lshr_i65_33:
; GCN: ; %bb.0:
; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GCN-NEXT: s_lshr_b32 s0, s1, 1
-; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_lshr_b32 s4, s1, 1
+; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
+; GCN-NEXT: s_or_b32 s0, s0, s4
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i65_33:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
-; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 31
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
; GFX10PLUS-NEXT: s_mov_b32 s2, 0
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s4
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll
index af377b1..e0581f01 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll
@@ -597,13 +597,13 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 s5, 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], 0x50
+; GFX7-NEXT: s_or_b32 s4, s3, 0x50
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
@@ -616,7 +616,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX8-NEXT: s_mov_b32 s3, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX8-NEXT: s_or_b32 s2, s2, 0x50
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -630,7 +630,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX9-NEXT: s_or_b32 s2, s2, 0x50
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -644,7 +644,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s3, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX10-NEXT: s_or_b32 s2, s2, 0x50
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -658,7 +658,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX11-NEXT: s_or_b32 s2, s2, 0x50
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -671,7 +671,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-NEXT: s_mov_b32 s3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX12-NEXT: s_or_b32 s2, s2, 0x50
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index a9b3deb..cfe655f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -1381,7 +1381,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -1393,7 +1393,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1405,7 +1405,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX9-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1418,7 +1418,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 1, v2
-; GFX10PLUS-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v3
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%shl = shl i65 %value, 33
%ashr = ashr i65 %value, 33
@@ -1429,29 +1429,27 @@ define amdgpu_ps i65 @s_sext_inreg_i65_18(i65 inreg %value) {
; GCN-LABEL: s_sext_inreg_i65_18:
; GCN: ; %bb.0:
; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 18
-; GCN-NEXT: s_lshr_b32 s4, s1, 14
-; GCN-NEXT: s_mov_b32 s5, 0
-; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT: s_lshr_b32 s3, s1, 14
+; GCN-NEXT: s_or_b32 s2, s2, s3
; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000
-; GCN-NEXT: s_lshl_b32 s7, s2, 14
-; GCN-NEXT: s_mov_b32 s6, s5
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GCN-NEXT: s_lshl_b32 s5, s2, 14
+; GCN-NEXT: s_mov_b32 s4, 0
+; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 18
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_sext_inreg_i65_18:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 18
-; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 14
-; GFX10PLUS-NEXT: s_mov_b32 s5, 0
+; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 14
; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000
-; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
-; GFX10PLUS-NEXT: s_mov_b32 s6, s5
+; GFX10PLUS-NEXT: s_or_b32 s2, s2, s3
+; GFX10PLUS-NEXT: s_mov_b32 s4, 0
; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX10PLUS-NEXT: s_lshl_b32 s7, s2, 14
+; GFX10PLUS-NEXT: s_lshl_b32 s5, s2, 14
; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[2:3], 18
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX10PLUS-NEXT: ; return to shader part epilog
%shl = shl i65 %value, 18
%ashr = ashr i65 %shl, 18
@@ -1464,13 +1462,12 @@ define amdgpu_ps i65 @s_sext_inreg_i65_33(i65 inreg %value) {
; GCN-NEXT: s_lshl_b32 s3, s2, 1
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], 31
-; GCN-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5]
-; GCN-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GCN-NEXT: s_bfe_u32 s0, s0, 0x1f0000
-; GCN-NEXT: s_mov_b32 s1, s2
-; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT: s_ashr_i32 s2, s5, 1
+; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GCN-NEXT: s_bfe_u32 s4, s0, 0x1f0000
+; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
+; GCN-NEXT: s_or_b32 s0, s0, s4
+; GCN-NEXT: s_ashr_i32 s2, s3, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_sext_inreg_i65_33:
@@ -1478,13 +1475,12 @@ define amdgpu_ps i65 @s_sext_inreg_i65_33(i65 inreg %value) {
; GFX10PLUS-NEXT: s_lshl_b32 s3, s2, 1
; GFX10PLUS-NEXT: s_mov_b32 s2, 0
; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[0:1], 31
-; GFX10PLUS-NEXT: s_bfe_u32 s0, s0, 0x1f0000
-; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5]
-; GFX10PLUS-NEXT: s_mov_b32 s1, s2
-; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[4:5], 31
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; GFX10PLUS-NEXT: s_ashr_i32 s2, s5, 1
+; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX10PLUS-NEXT: s_bfe_u32 s4, s0, 0x1f0000
+; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
+; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s4
; GFX10PLUS-NEXT: ; return to shader part epilog
%shl = shl i65 %value, 33
%ashr = ashr i65 %shl, 33
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
index afd0f01..6831380 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
@@ -415,28 +415,18 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; GISEL-GFX942-LABEL: memcpy_known:
; GISEL-GFX942: ; %bb.0:
; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GISEL-GFX942-NEXT: s_load_dword s7, s[4:5], 0x54
; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34
-; GISEL-GFX942-NEXT: s_mov_b32 s7, 0
; GISEL-GFX942-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x44
-; GISEL-GFX942-NEXT: s_mov_b32 s8, s7
+; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
+; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x2000
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
+; GISEL-GFX942-NEXT: s_mov_b32 s8, s1
; GISEL-GFX942-NEXT: s_mov_b32 s9, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_load_dword s3, s[4:5], 0x54
-; GISEL-GFX942-NEXT: s_mov_b32 s10, s7
-; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s13
-; GISEL-GFX942-NEXT: s_mov_b32 s4, s7
+; GISEL-GFX942-NEXT: s_mov_b32 s10, s3
+; GISEL-GFX942-NEXT: s_mov_b32 s4, s13
; GISEL-GFX942-NEXT: s_mov_b32 s5, s14
-; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
-; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GISEL-GFX942-NEXT: s_mov_b32 s6, s15
-; GISEL-GFX942-NEXT: s_mov_b32 s2, s7
-; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3]
-; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x2000
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -491,25 +481,16 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34
; GISEL-GFX1100-NEXT: s_load_b32 s15, s[4:5], 0x54
-; GISEL-GFX1100-NEXT: s_mov_b32 s17, 0
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, 0
-; GISEL-GFX1100-NEXT: s_mov_b32 s4, s17
-; GISEL-GFX1100-NEXT: s_mov_b32 s6, s17
-; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s12
-; GISEL-GFX1100-NEXT: s_mov_b32 s14, s17
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, 0
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s4
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s1
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1
; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2
-; GISEL-GFX1100-NEXT: s_mov_b32 s2, s17
-; GISEL-GFX1100-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s3
-; GISEL-GFX1100-NEXT: s_mov_b32 s3, s10
-; GISEL-GFX1100-NEXT: s_or_b64 s[6:7], s[16:17], s[6:7]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s9
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_or_b64 s[12:13], s[16:17], s[2:3]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s11
-; GISEL-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15]
+; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9
+; GISEL-GFX1100-NEXT: s_mov_b32 s13, s10
+; GISEL-GFX1100-NEXT: s_mov_b32 s14, s11
; GISEL-GFX1100-NEXT: .LBB0_1: ; %load-store-loop
; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0
@@ -960,28 +941,18 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX942-LABEL: memcpy_known_medium:
; GISEL-GFX942: ; %bb.0:
; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GISEL-GFX942-NEXT: s_load_dword s7, s[4:5], 0x54
; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34
-; GISEL-GFX942-NEXT: s_mov_b32 s7, 0
; GISEL-GFX942-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x44
-; GISEL-GFX942-NEXT: s_mov_b32 s8, s7
+; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
+; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x100
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
+; GISEL-GFX942-NEXT: s_mov_b32 s8, s1
; GISEL-GFX942-NEXT: s_mov_b32 s9, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_load_dword s3, s[4:5], 0x54
-; GISEL-GFX942-NEXT: s_mov_b32 s10, s7
-; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s13
-; GISEL-GFX942-NEXT: s_mov_b32 s4, s7
+; GISEL-GFX942-NEXT: s_mov_b32 s10, s3
+; GISEL-GFX942-NEXT: s_mov_b32 s4, s13
; GISEL-GFX942-NEXT: s_mov_b32 s5, s14
-; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
-; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GISEL-GFX942-NEXT: s_mov_b32 s6, s15
-; GISEL-GFX942-NEXT: s_mov_b32 s2, s7
-; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3]
-; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x100
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
; GISEL-GFX942-NEXT: .LBB1_1: ; %load-store-loop
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1036,25 +1007,16 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34
; GISEL-GFX1100-NEXT: s_load_b32 s15, s[4:5], 0x54
-; GISEL-GFX1100-NEXT: s_mov_b32 s17, 0
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, 0
-; GISEL-GFX1100-NEXT: s_mov_b32 s4, s17
-; GISEL-GFX1100-NEXT: s_mov_b32 s6, s17
-; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s12
-; GISEL-GFX1100-NEXT: s_mov_b32 s14, s17
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, 0
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s4
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s1
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1
; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2
-; GISEL-GFX1100-NEXT: s_mov_b32 s2, s17
-; GISEL-GFX1100-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s3
-; GISEL-GFX1100-NEXT: s_mov_b32 s3, s10
-; GISEL-GFX1100-NEXT: s_or_b64 s[6:7], s[16:17], s[6:7]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s9
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_or_b64 s[12:13], s[16:17], s[2:3]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s11
-; GISEL-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15]
+; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9
+; GISEL-GFX1100-NEXT: s_mov_b32 s13, s10
+; GISEL-GFX1100-NEXT: s_mov_b32 s14, s11
; GISEL-GFX1100-NEXT: .LBB1_1: ; %load-store-loop
; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0
@@ -1228,27 +1190,18 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; GISEL-GFX942: ; %bb.0:
; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34
-; GISEL-GFX942-NEXT: s_mov_b32 s7, 0
-; GISEL-GFX942-NEXT: s_mov_b32 s8, s7
-; GISEL-GFX942-NEXT: s_mov_b32 s10, s7
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
+; GISEL-GFX942-NEXT: s_mov_b32 s8, s1
; GISEL-GFX942-NEXT: s_mov_b32 s9, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
+; GISEL-GFX942-NEXT: s_mov_b32 s10, s3
; GISEL-GFX942-NEXT: v_mov_b32_e32 v4, s0
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen
; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
-; GISEL-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54
-; GISEL-GFX942-NEXT: s_mov_b32 s4, s7
-; GISEL-GFX942-NEXT: s_mov_b32 s12, s7
+; GISEL-GFX942-NEXT: s_load_dword s7, s[4:5], 0x54
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
+; GISEL-GFX942-NEXT: s_mov_b32 s4, s1
; GISEL-GFX942-NEXT: s_mov_b32 s5, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13]
; GISEL-GFX942-NEXT: v_mov_b32_e32 v5, s0
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen
@@ -1261,35 +1214,24 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; GISEL-GFX1100: ; %bb.0:
; GISEL-GFX1100-NEXT: s_clause 0x1
; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34
-; GISEL-GFX1100-NEXT: s_mov_b32 s13, 0
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_mov_b32 s8, s13
-; GISEL-GFX1100-NEXT: s_mov_b32 s6, s13
+; GISEL-GFX1100-NEXT: s_load_b32 s11, s[4:5], 0x34
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, s1
-; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2
; GISEL-GFX1100-NEXT: v_mov_b32_e32 v4, s0
-; GISEL-GFX1100-NEXT: s_or_b64 s[0:1], s[12:13], s[8:9]
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, s3
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_or_b64 s[2:3], s[12:13], s[6:7]
-; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[0:3], 0 offen
+; GISEL-GFX1100-NEXT: s_mov_b32 s8, s1
+; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2
+; GISEL-GFX1100-NEXT: s_mov_b32 s10, s3
+; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
; GISEL-GFX1100-NEXT: s_clause 0x1
-; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
+; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x54
-; GISEL-GFX1100-NEXT: s_mov_b32 s4, s13
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9
-; GISEL-GFX1100-NEXT: s_mov_b32 s5, s10
-; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s8
-; GISEL-GFX1100-NEXT: s_or_b64 s[4:5], s[12:13], s[4:5]
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, s11
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7]
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s0
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1
+; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2
+; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX1100-NEXT: buffer_store_b128 v[0:3], v5, s[4:7], 0 offen
-; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[0:3], 0 offen offset:16
+; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen offset:16
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX1100-NEXT: buffer_store_b128 v[0:3], v5, s[4:7], 0 offen offset:16
; GISEL-GFX1100-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 6c8207a..df7f8c6 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -4344,7 +4344,7 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
; GFX9-G-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX9-G-NEXT: v_lshrrev_b32_e32 v3, 1, v4
-; GFX9-G-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v3
; GFX9-G-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX9-G-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX9-G-NEXT: s_setpc_b64 s[30:31]
@@ -4375,14 +4375,12 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v4
; GFX9-G-O0-NEXT: s_mov_b32 s5, 1
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s5
-; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[5:6]
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v3
-; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v0, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v0, v[5:6]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6
+; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v2
; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v3, v2, v4
@@ -4437,7 +4435,7 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) {
; GFX9-G-NEXT: v_mov_b32_e32 v4, v1
; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3]
; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v4
-; GFX9-G-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX9-G-NEXT: v_mov_b32_e32 v3, 0
; GFX9-G-NEXT: s_setpc_b64 s[30:31]
@@ -4450,15 +4448,13 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) {
; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v3
; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v0, v1
; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[4:5]
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v4
-; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v0, v[4:5]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v2
; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v2, v3
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
index b5b2655..31344c7 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
@@ -2080,21 +2080,13 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_16(ptr addrs
}
define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr addrspace(6) inreg %sbase, i32 %idx) {
-; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160:
-; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: v_or_b32_e32 v0, 0x1040, v0
-; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160:
-; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, 0x1040, v0
-; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; return to shader part epilog
+; GFX1250-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_or_b32_e32 v0, 0x1040, v0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
%zext.idx = zext i32 %idx to i64
%or = or i64 %zext.idx, 4160
%addr = inttoptr i64 %or to ptr
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index e59fbad..62ec010 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -1,117 +1,296 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
define amdgpu_ps float @test_fmaximum_f32_vv(float %a, float %b) {
-; GCN-LABEL: test_fmaximum_f32_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_f32_vv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.maximum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fmaximum_f32_ss(float inreg %a, float inreg %b) {
-; GCN-LABEL: test_fmaximum_f32_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_maximum_f32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_f32_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_max_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_maximum_f32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.maximum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fmaximum_f32_vs(float %a, float inreg %b) {
-; GCN-LABEL: test_fmaximum_f32_vs:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_f32_vs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_vs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.maximum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fmaximum_nnan_f32(float %a, float %b) {
-; GCN-LABEL: test_fmaximum_nnan_f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_nnan_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_nnan_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call nnan float @llvm.maximum.f32(float %a, float %b)
ret float %val
}
+define amdgpu_ps float @test_fmaximum_nsz_f32(float %a, float %b) {
+; GFX9-LABEL: test_fmaximum_nsz_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_nsz_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call nsz float @llvm.maximum.f32(float %a, float %b)
+ ret float %val
+}
+
+define amdgpu_ps float @test_fmaximum_signed_zero_f32() {
+; GFX9-LABEL: test_fmaximum_signed_zero_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_signed_zero_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.maximum.f32(float -0.0, float 0.0)
+ ret float %val
+}
+
define amdgpu_ps <2 x float> @test_fmaximum_v2f32(<2 x float> %a, <2 x float> %b) {
-; GCN-LABEL: test_fmaximum_v2f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v2
-; GCN-NEXT: v_maximum_f32 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_max_f32_e32 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v2
+; GFX12-NEXT: v_maximum_f32 v1, v1, v3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
ret <2 x float> %val
}
define amdgpu_ps <2 x float> @test_fmaximum_v2f32_ss(<2 x float> inreg %a, <2 x float> inreg %b) {
-; GCN-LABEL: test_fmaximum_v2f32_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_maximum_f32 s0, s0, s2
-; GCN-NEXT: s_maximum_f32 s1, s1, s3
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v2f32_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_max_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_max_f32_e32 v3, s1, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f32_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_maximum_f32 s0, s0, s2
+; GFX12-NEXT: s_maximum_f32 s1, s1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
ret <2 x float> %val
}
define amdgpu_ps <3 x float> @test_fmaximum_v3f32(<3 x float> %a, <3 x float> %b) {
-; GCN-LABEL: test_fmaximum_v3f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v3
-; GCN-NEXT: v_maximum_f32 v1, v1, v4
-; GCN-NEXT: v_maximum_f32 v2, v2, v5
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v3f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v6, v0, v3
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_max_f32_e32 v3, v1, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v3, v2, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v3f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v3
+; GFX12-NEXT: v_maximum_f32 v1, v1, v4
+; GFX12-NEXT: v_maximum_f32 v2, v2, v5
+; GFX12-NEXT: ; return to shader part epilog
%val = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
ret <3 x float> %val
}
define amdgpu_ps <4 x float> @test_fmaximum_v4f32(<4 x float> %a, <4 x float> %b) {
-; GCN-LABEL: test_fmaximum_v4f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v4
-; GCN-NEXT: v_maximum_f32 v1, v1, v5
-; GCN-NEXT: v_maximum_f32 v2, v2, v6
-; GCN-NEXT: v_maximum_f32 v3, v3, v7
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v4f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v8, v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX9-NEXT: v_max_f32_e32 v4, v1, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX9-NEXT: v_max_f32_e32 v4, v2, v6
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX9-NEXT: v_max_f32_e32 v4, v3, v7
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v4
+; GFX12-NEXT: v_maximum_f32 v1, v1, v5
+; GFX12-NEXT: v_maximum_f32 v2, v2, v6
+; GFX12-NEXT: v_maximum_f32 v3, v3, v7
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x float> @llvm.maximum.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %val
}
define amdgpu_ps <16 x float> @test_fmaximum_v16f32(<16 x float> %a, <16 x float> %b) {
-; GCN-LABEL: test_fmaximum_v16f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v16
-; GCN-NEXT: v_maximum_f32 v1, v1, v17
-; GCN-NEXT: v_maximum_f32 v2, v2, v18
-; GCN-NEXT: v_maximum_f32 v3, v3, v19
-; GCN-NEXT: v_maximum_f32 v4, v4, v20
-; GCN-NEXT: v_maximum_f32 v5, v5, v21
-; GCN-NEXT: v_maximum_f32 v6, v6, v22
-; GCN-NEXT: v_maximum_f32 v7, v7, v23
-; GCN-NEXT: v_maximum_f32 v8, v8, v24
-; GCN-NEXT: v_maximum_f32 v9, v9, v25
-; GCN-NEXT: v_maximum_f32 v10, v10, v26
-; GCN-NEXT: v_maximum_f32 v11, v11, v27
-; GCN-NEXT: v_maximum_f32 v12, v12, v28
-; GCN-NEXT: v_maximum_f32 v13, v13, v29
-; GCN-NEXT: v_maximum_f32 v14, v14, v30
-; GCN-NEXT: v_maximum_f32 v15, v15, v31
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v16f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v32, v1, v17
+; GFX9-NEXT: v_mov_b32_e32 v33, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v17
+; GFX9-NEXT: v_max_f32_e32 v1, v0, v16
+; GFX9-NEXT: v_cmp_o_f32_e64 s[12:13], v0, v16
+; GFX9-NEXT: v_max_f32_e32 v17, v2, v18
+; GFX9-NEXT: v_cmp_o_f32_e64 s[0:1], v2, v18
+; GFX9-NEXT: v_max_f32_e32 v18, v3, v19
+; GFX9-NEXT: v_cmp_o_f32_e64 s[2:3], v3, v19
+; GFX9-NEXT: v_max_f32_e32 v19, v4, v20
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], v4, v20
+; GFX9-NEXT: v_max_f32_e32 v20, v5, v21
+; GFX9-NEXT: v_cmp_o_f32_e64 s[6:7], v5, v21
+; GFX9-NEXT: v_max_f32_e32 v21, v6, v22
+; GFX9-NEXT: v_cmp_o_f32_e64 s[8:9], v6, v22
+; GFX9-NEXT: v_max_f32_e32 v22, v7, v23
+; GFX9-NEXT: v_cmp_o_f32_e64 s[10:11], v7, v23
+; GFX9-NEXT: v_max_f32_e32 v23, v8, v24
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v33, v1, s[12:13]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v32, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v24
+; GFX9-NEXT: v_max_f32_e32 v34, v9, v25
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v23, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v9, v25
+; GFX9-NEXT: v_max_f32_e32 v35, v10, v26
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v26
+; GFX9-NEXT: v_max_f32_e32 v36, v11, v27
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v35, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v11, v27
+; GFX9-NEXT: v_max_f32_e32 v37, v12, v28
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v36, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v28
+; GFX9-NEXT: v_max_f32_e32 v16, v13, v29
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v37, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v13, v29
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v16, vcc
+; GFX9-NEXT: v_max_f32_e32 v16, v14, v30
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v30
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v16, vcc
+; GFX9-NEXT: v_max_f32_e32 v16, v15, v31
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v31
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v33, v17, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v33, v18, s[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v33, v19, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v33, v20, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v33, v21, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v33, v22, s[10:11]
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v16, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v16f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v16
+; GFX12-NEXT: v_maximum_f32 v1, v1, v17
+; GFX12-NEXT: v_maximum_f32 v2, v2, v18
+; GFX12-NEXT: v_maximum_f32 v3, v3, v19
+; GFX12-NEXT: v_maximum_f32 v4, v4, v20
+; GFX12-NEXT: v_maximum_f32 v5, v5, v21
+; GFX12-NEXT: v_maximum_f32 v6, v6, v22
+; GFX12-NEXT: v_maximum_f32 v7, v7, v23
+; GFX12-NEXT: v_maximum_f32 v8, v8, v24
+; GFX12-NEXT: v_maximum_f32 v9, v9, v25
+; GFX12-NEXT: v_maximum_f32 v10, v10, v26
+; GFX12-NEXT: v_maximum_f32 v11, v11, v27
+; GFX12-NEXT: v_maximum_f32 v12, v12, v28
+; GFX12-NEXT: v_maximum_f32 v13, v13, v29
+; GFX12-NEXT: v_maximum_f32 v14, v14, v30
+; GFX12-NEXT: v_maximum_f32 v15, v15, v31
+; GFX12-NEXT: ; return to shader part epilog
%val = call <16 x float> @llvm.maximum.v16f32(<16 x float> %a, <16 x float> %b)
ret <16 x float> %val
}
define amdgpu_ps half @test_fmaximum_f16_vv(half %a, half %b) {
+; GFX9-LABEL: test_fmaximum_f16_vv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-TRUE16-LABEL: test_fmaximum_f16_vv:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: v_maximum_f16 v0.l, v0.l, v1.l
@@ -136,35 +315,131 @@ define amdgpu_ps half @test_fmaximum_f16_vv(half %a, half %b) {
}
define amdgpu_ps half @test_fmaximum_f16_ss(half inreg %a, half inreg %b) {
-; GCN-LABEL: test_fmaximum_f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_maximum_f16 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_f16_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_max_f16_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_maximum_f16 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call half @llvm.maximum.f16(half %a, half %b)
ret half %val
}
define amdgpu_ps <2 x half> @test_fmaximum_v2f16_vv(<2 x half> %a, <2 x half> %b) {
-; GCN-LABEL: test_fmaximum_v2f16_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v2f16_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_max_f16 v2, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v2f16_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v0, v1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v0, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f16_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %val
}
define amdgpu_ps <2 x half> @test_fmaximum_v2f16_ss(<2 x half> inreg %a, <2 x half> inreg %b) {
-; GCN-LABEL: test_fmaximum_v2f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v2f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-SDAG-NEXT: v_pk_max_f16 v1, s0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v2f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-GISEL-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-GISEL-NEXT: v_pk_max_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s2, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_maximum_f16 v0, s0, s1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %val
}
define amdgpu_ps <3 x half> @test_fmaximum_v3f16_vv(<3 x half> %a, <3 x half> %b) {
+; GFX9-SDAG-LABEL: test_fmaximum_v3f16_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_max_f16 v4, v1, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-SDAG-NEXT: v_pk_max_f16 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v3f16_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v0, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v0, v2
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v5, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-LABEL: test_fmaximum_v3f16_vv:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_pk_maximum_f16 v0, v0, v2
@@ -187,6 +462,49 @@ define amdgpu_ps <3 x half> @test_fmaximum_v3f16_vv(<3 x half> %a, <3 x half> %b
}
define amdgpu_ps <3 x half> @test_fmaximum_v3f16_ss(<3 x half> inreg %a, <3 x half> inreg %b) {
+; GFX9-SDAG-LABEL: test_fmaximum_v3f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_pk_max_f16 v1, s1, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_pk_max_f16 v3, s0, v3
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v4
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v3f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_lshr_b32 s5, s2, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-GISEL-NEXT: v_pk_max_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s4, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_pk_max_f16 v3, s1, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s1, v1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-LABEL: test_fmaximum_v3f16_ss:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_pk_maximum_f16 v0, s0, s2
@@ -206,97 +524,384 @@ define amdgpu_ps <3 x half> @test_fmaximum_v3f16_ss(<3 x half> inreg %a, <3 x ha
}
define amdgpu_ps <4 x half> @test_fmaximum_v4f16(<4 x half> %a, <4 x half> %b) {
-; GCN-LABEL: test_fmaximum_v4f16:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, v0, v2
-; GCN-NEXT: v_pk_maximum_f16 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v4f16:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_max_f16 v4, v1, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_pk_max_f16 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: v_perm_b32 v1, v1, v6, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v4f16:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v0, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v6, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %val
}
define amdgpu_ps <4 x half> @test_fmaximum_v4f16_ss(<4 x half> inreg %a, <4 x half> inreg %b) {
-; GCN-LABEL: test_fmaximum_v4f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, s0, s2
-; GCN-NEXT: v_pk_maximum_f16 v1, s1, s3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v4f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: s_lshr_b32 s3, s3, 16
+; GFX9-SDAG-NEXT: v_pk_max_f16 v1, s1, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, s2
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_pk_max_f16 v4, s0, v4
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v5
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v2, v2, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v4f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s2, 16
+; GFX9-GISEL-NEXT: v_pk_max_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s4, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s3, 16
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: s_lshr_b32 s0, s1, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-GISEL-NEXT: v_pk_max_f16 v2, s1, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s1, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_maximum_f16 v0, s0, s2
+; GFX12-NEXT: v_pk_maximum_f16 v1, s1, s3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %val
}
define amdgpu_ps <2 x float> @test_fmaximum_f64_vv(double %a, double %b) {
-; GCN-LABEL: test_fmaximum_f64_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_f64_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_f64_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f64_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: ; return to shader part epilog
%val = call double @llvm.maximum.f64(double %a, double %b)
%ret = bitcast double %val to <2 x float>
ret <2 x float> %ret
}
define amdgpu_ps <2 x float> @test_fmaximum_f64_ss(double inreg %a, double inreg %b) {
-; GCN-LABEL: test_fmaximum_f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], s[0:1], s[2:3]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: ; return to shader part epilog
%val = call double @llvm.maximum.f64(double %a, double %b)
%ret = bitcast double %val to <2 x float>
ret <2 x float> %ret
}
define amdgpu_ps <4 x float> @test_fmaximum_v2f64_ss(<2 x double> inreg %a, <2 x double> inreg %b) {
-; GCN-LABEL: test_fmaximum_v2f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], s[0:1], s[4:5]
-; GCN-NEXT: v_maximum_f64 v[2:3], s[2:3], s[6:7]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v2f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-SDAG-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-SDAG-NEXT: v_max_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v2f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-GISEL-NEXT: v_max_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v5, s[0:1]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[6:7]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x double> @llvm.maximum.v2f64(<2 x double> %a, <2 x double> %b)
%ret = bitcast <2 x double> %val to <4 x float>
ret <4 x float> %ret
}
define amdgpu_ps <8 x float> @test_fmaximum_v4f64(<4 x double> %a, <4 x double> %b) {
-; GCN-LABEL: test_fmaximum_v4f64:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], v[0:1], v[8:9]
-; GCN-NEXT: v_maximum_f64 v[2:3], v[2:3], v[10:11]
-; GCN-NEXT: v_maximum_f64 v[4:5], v[4:5], v[12:13]
-; GCN-NEXT: v_maximum_f64 v[6:7], v[6:7], v[14:15]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v4f64:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-SDAG-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX9-SDAG-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], v[4:5], v[12:13]
+; GFX9-SDAG-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v7, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[4:5]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v4f64:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-GISEL-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX9-GISEL-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], v[4:5], v[12:13]
+; GFX9-GISEL-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v18, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v16, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v18, v17, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v18, v9, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v18, v11, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v7, v18, v13, s[4:5]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[8:9]
+; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[10:11]
+; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[12:13]
+; GFX12-NEXT: v_maximum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x double> @llvm.maximum.v4f64(<4 x double> %a, <4 x double> %b)
%ret = bitcast <4 x double> %val to <8 x float>
ret <8 x float> %ret
}
define amdgpu_ps <8 x float> @test_fmaximum_v4f64_ss(<4 x double> inreg %a, <4 x double> inreg %b) {
-; GCN-LABEL: test_fmaximum_v4f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], s[0:1], s[8:9]
-; GCN-NEXT: v_maximum_f64 v[2:3], s[2:3], s[10:11]
-; GCN-NEXT: v_maximum_f64 v[4:5], s[4:5], s[12:13]
-; GCN-NEXT: v_maximum_f64 v[6:7], s[6:7], s[14:15]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v4f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v10, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-SDAG-NEXT: v_max_f64 v[4:5], s[2:3], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[1:2]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-SDAG-NEXT: v_max_f64 v[6:7], s[4:5], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], s[4:5], v[1:2]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-SDAG-NEXT: v_max_f64 v[8:9], s[6:7], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[1:2]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v8, 0, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[4:5]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v4f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-GISEL-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-GISEL-NEXT: v_max_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s13
+; GFX9-GISEL-NEXT: v_max_f64 v[6:7], s[4:5], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], s[4:5], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s15
+; GFX9-GISEL-NEXT: v_max_f64 v[8:9], s[6:7], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[4:5], s[6:7], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v10, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v10, v5, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v10, v7, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v8, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v7, v10, v9, s[4:5]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[8:9]
+; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[10:11]
+; GFX12-NEXT: v_maximum_f64 v[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: v_maximum_f64 v[6:7], s[6:7], s[14:15]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x double> @llvm.maximum.v4f64(<4 x double> %a, <4 x double> %b)
%ret = bitcast <4 x double> %val to <8 x float>
ret <8 x float> %ret
}
define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
-; GCN-LABEL: fmaximumi_f32_move_to_valu:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: v_maximum_f32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
-; GCN-NEXT: s_endpgm
+; GFX9-LABEL: fmaximumi_f32_move_to_valu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v4, v1, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX12-LABEL: fmaximumi_f32_move_to_valu:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_maximum_f32 v1, v1, v2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%v = call float @llvm.maximum.f32(float %a, float %b)
@@ -305,6 +910,23 @@ define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
}
define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
+; GFX9-LABEL: fmaximum_f16_move_to_valu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v4, v1, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
; GFX12-SDAG-TRUE16-LABEL: fmaximum_f16_move_to_valu:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1
@@ -371,6 +993,40 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr
ret void
}
+define amdgpu_ps float @test_fmaximum_f32_ieee_on(float %a, float %b) #0 {
+; GFX9-LABEL: test_fmaximum_f32_ieee_on:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_ieee_on:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.maximum.f32(float %a, float %b)
+ ret float %val
+}
+
+define amdgpu_ps float @test_fmaximum_f32_ieee_off(float %a, float %b) #1 {
+; GFX9-LABEL: test_fmaximum_f32_ieee_off:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_ieee_off:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.maximum.f32(float %a, float %b)
+ ret float %val
+}
+
declare float @llvm.maximum.f32(float, float)
declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
declare <3 x float> @llvm.maximum.v3f32(<3 x float>, <3 x float>)
@@ -383,3 +1039,6 @@ declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>)
declare double @llvm.maximum.f64(double, double)
declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
+
+attributes #0 = { nounwind "amdgpu-ieee"="true" }
+attributes #1 = { nounwind "amdgpu-ieee"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index b25120f..474ac7c 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -1,117 +1,296 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
define amdgpu_ps float @test_fminimum_f32_vv(float %a, float %b) {
-; GCN-LABEL: test_fminimum_f32_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_f32_vv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.minimum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fminimum_f32_ss(float inreg %a, float inreg %b) {
-; GCN-LABEL: test_fminimum_f32_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_minimum_f32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_f32_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_min_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_minimum_f32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.minimum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fminimum_f32_vs(float %a, float inreg %b) {
-; GCN-LABEL: test_fminimum_f32_vs:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_f32_vs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_vs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.minimum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fminimum_nnan_f32(float %a, float %b) {
-; GCN-LABEL: test_fminimum_nnan_f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_nnan_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_nnan_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call nnan float @llvm.minimum.f32(float %a, float %b)
ret float %val
}
+define amdgpu_ps float @test_fminimum_nsz_f32(float %a, float %b) {
+; GFX9-LABEL: test_fminimum_nsz_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_nsz_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call nsz float @llvm.minimum.f32(float %a, float %b)
+ ret float %val
+}
+
+define amdgpu_ps float @test_fminimum_signed_zero_f32() {
+; GFX9-LABEL: test_fminimum_signed_zero_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_signed_zero_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_bfrev_b32_e32 v0, 1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.minimum.f32(float -0.0, float 0.0)
+ ret float %val
+}
+
define amdgpu_ps <2 x float> @test_fminimum_v2f32(<2 x float> %a, <2 x float> %b) {
-; GCN-LABEL: test_fminimum_v2f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v2
-; GCN-NEXT: v_minimum_f32 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_min_f32_e32 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v2
+; GFX12-NEXT: v_minimum_f32 v1, v1, v3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
ret <2 x float> %val
}
define amdgpu_ps <2 x float> @test_fminimum_v2f32_ss(<2 x float> inreg %a, <2 x float> inreg %b) {
-; GCN-LABEL: test_fminimum_v2f32_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_minimum_f32 s0, s0, s2
-; GCN-NEXT: s_minimum_f32 s1, s1, s3
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v2f32_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_min_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_min_f32_e32 v3, s1, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f32_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_minimum_f32 s0, s0, s2
+; GFX12-NEXT: s_minimum_f32 s1, s1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
ret <2 x float> %val
}
define amdgpu_ps <3 x float> @test_fminimum_v3f32(<3 x float> %a, <3 x float> %b) {
-; GCN-LABEL: test_fminimum_v3f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v3
-; GCN-NEXT: v_minimum_f32 v1, v1, v4
-; GCN-NEXT: v_minimum_f32 v2, v2, v5
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v3f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v6, v0, v3
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_min_f32_e32 v3, v1, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX9-NEXT: v_min_f32_e32 v3, v2, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v3f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v3
+; GFX12-NEXT: v_minimum_f32 v1, v1, v4
+; GFX12-NEXT: v_minimum_f32 v2, v2, v5
+; GFX12-NEXT: ; return to shader part epilog
%val = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
ret <3 x float> %val
}
define amdgpu_ps <4 x float> @test_fminimum_v4f32(<4 x float> %a, <4 x float> %b) {
-; GCN-LABEL: test_fminimum_v4f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v4
-; GCN-NEXT: v_minimum_f32 v1, v1, v5
-; GCN-NEXT: v_minimum_f32 v2, v2, v6
-; GCN-NEXT: v_minimum_f32 v3, v3, v7
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v4f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v8, v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX9-NEXT: v_min_f32_e32 v4, v1, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX9-NEXT: v_min_f32_e32 v4, v2, v6
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX9-NEXT: v_min_f32_e32 v4, v3, v7
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v4
+; GFX12-NEXT: v_minimum_f32 v1, v1, v5
+; GFX12-NEXT: v_minimum_f32 v2, v2, v6
+; GFX12-NEXT: v_minimum_f32 v3, v3, v7
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x float> @llvm.minimum.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %val
}
define amdgpu_ps <16 x float> @test_fminimum_v16f32(<16 x float> %a, <16 x float> %b) {
-; GCN-LABEL: test_fminimum_v16f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v16
-; GCN-NEXT: v_minimum_f32 v1, v1, v17
-; GCN-NEXT: v_minimum_f32 v2, v2, v18
-; GCN-NEXT: v_minimum_f32 v3, v3, v19
-; GCN-NEXT: v_minimum_f32 v4, v4, v20
-; GCN-NEXT: v_minimum_f32 v5, v5, v21
-; GCN-NEXT: v_minimum_f32 v6, v6, v22
-; GCN-NEXT: v_minimum_f32 v7, v7, v23
-; GCN-NEXT: v_minimum_f32 v8, v8, v24
-; GCN-NEXT: v_minimum_f32 v9, v9, v25
-; GCN-NEXT: v_minimum_f32 v10, v10, v26
-; GCN-NEXT: v_minimum_f32 v11, v11, v27
-; GCN-NEXT: v_minimum_f32 v12, v12, v28
-; GCN-NEXT: v_minimum_f32 v13, v13, v29
-; GCN-NEXT: v_minimum_f32 v14, v14, v30
-; GCN-NEXT: v_minimum_f32 v15, v15, v31
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v16f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v32, v1, v17
+; GFX9-NEXT: v_mov_b32_e32 v33, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v17
+; GFX9-NEXT: v_min_f32_e32 v1, v0, v16
+; GFX9-NEXT: v_cmp_o_f32_e64 s[12:13], v0, v16
+; GFX9-NEXT: v_min_f32_e32 v17, v2, v18
+; GFX9-NEXT: v_cmp_o_f32_e64 s[0:1], v2, v18
+; GFX9-NEXT: v_min_f32_e32 v18, v3, v19
+; GFX9-NEXT: v_cmp_o_f32_e64 s[2:3], v3, v19
+; GFX9-NEXT: v_min_f32_e32 v19, v4, v20
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], v4, v20
+; GFX9-NEXT: v_min_f32_e32 v20, v5, v21
+; GFX9-NEXT: v_cmp_o_f32_e64 s[6:7], v5, v21
+; GFX9-NEXT: v_min_f32_e32 v21, v6, v22
+; GFX9-NEXT: v_cmp_o_f32_e64 s[8:9], v6, v22
+; GFX9-NEXT: v_min_f32_e32 v22, v7, v23
+; GFX9-NEXT: v_cmp_o_f32_e64 s[10:11], v7, v23
+; GFX9-NEXT: v_min_f32_e32 v23, v8, v24
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v33, v1, s[12:13]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v32, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v24
+; GFX9-NEXT: v_min_f32_e32 v34, v9, v25
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v23, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v9, v25
+; GFX9-NEXT: v_min_f32_e32 v35, v10, v26
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v26
+; GFX9-NEXT: v_min_f32_e32 v36, v11, v27
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v35, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v11, v27
+; GFX9-NEXT: v_min_f32_e32 v37, v12, v28
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v36, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v28
+; GFX9-NEXT: v_min_f32_e32 v16, v13, v29
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v37, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v13, v29
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v16, vcc
+; GFX9-NEXT: v_min_f32_e32 v16, v14, v30
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v30
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v16, vcc
+; GFX9-NEXT: v_min_f32_e32 v16, v15, v31
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v31
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v33, v17, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v33, v18, s[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v33, v19, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v33, v20, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v33, v21, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v33, v22, s[10:11]
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v16, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v16f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v16
+; GFX12-NEXT: v_minimum_f32 v1, v1, v17
+; GFX12-NEXT: v_minimum_f32 v2, v2, v18
+; GFX12-NEXT: v_minimum_f32 v3, v3, v19
+; GFX12-NEXT: v_minimum_f32 v4, v4, v20
+; GFX12-NEXT: v_minimum_f32 v5, v5, v21
+; GFX12-NEXT: v_minimum_f32 v6, v6, v22
+; GFX12-NEXT: v_minimum_f32 v7, v7, v23
+; GFX12-NEXT: v_minimum_f32 v8, v8, v24
+; GFX12-NEXT: v_minimum_f32 v9, v9, v25
+; GFX12-NEXT: v_minimum_f32 v10, v10, v26
+; GFX12-NEXT: v_minimum_f32 v11, v11, v27
+; GFX12-NEXT: v_minimum_f32 v12, v12, v28
+; GFX12-NEXT: v_minimum_f32 v13, v13, v29
+; GFX12-NEXT: v_minimum_f32 v14, v14, v30
+; GFX12-NEXT: v_minimum_f32 v15, v15, v31
+; GFX12-NEXT: ; return to shader part epilog
%val = call <16 x float> @llvm.minimum.v16f32(<16 x float> %a, <16 x float> %b)
ret <16 x float> %val
}
define amdgpu_ps half @test_fminimum_f16_vv(half %a, half %b) {
+; GFX9-LABEL: test_fminimum_f16_vv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-TRUE16-LABEL: test_fminimum_f16_vv:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: v_minimum_f16 v0.l, v0.l, v1.l
@@ -136,35 +315,131 @@ define amdgpu_ps half @test_fminimum_f16_vv(half %a, half %b) {
}
define amdgpu_ps half @test_fminimum_f16_ss(half inreg %a, half inreg %b) {
-; GCN-LABEL: test_fminimum_f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_minimum_f16 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_f16_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_min_f16_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_minimum_f16 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call half @llvm.minimum.f16(half %a, half %b)
ret half %val
}
define amdgpu_ps <2 x half> @test_fminimum_v2f16_vv(<2 x half> %a, <2 x half> %b) {
-; GCN-LABEL: test_fminimum_v2f16_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v2f16_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_min_f16 v2, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v2f16_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_min_f16 v2, v0, v1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v0, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f16_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %val
}
define amdgpu_ps <2 x half> @test_fminimum_v2f16_ss(<2 x half> inreg %a, <2 x half> inreg %b) {
-; GCN-LABEL: test_fminimum_v2f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v2f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-SDAG-NEXT: v_pk_min_f16 v1, s0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v2f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-GISEL-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-GISEL-NEXT: v_pk_min_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s2, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_minimum_f16 v0, s0, s1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %val
}
define amdgpu_ps <3 x half> @test_fminimum_v3f16_vv(<3 x half> %a, <3 x half> %b) {
+; GFX9-SDAG-LABEL: test_fminimum_v3f16_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_min_f16 v4, v1, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-SDAG-NEXT: v_pk_min_f16 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v3f16_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v0, v2
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v5, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_pk_min_f16 v4, v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-LABEL: test_fminimum_v3f16_vv:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_pk_minimum_f16 v0, v0, v2
@@ -187,6 +462,49 @@ define amdgpu_ps <3 x half> @test_fminimum_v3f16_vv(<3 x half> %a, <3 x half> %b
}
define amdgpu_ps <3 x half> @test_fminimum_v3f16_ss(<3 x half> inreg %a, <3 x half> inreg %b) {
+; GFX9-SDAG-LABEL: test_fminimum_v3f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_pk_min_f16 v1, s1, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_pk_min_f16 v3, s0, v3
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v4
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v3f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_lshr_b32 s5, s2, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-GISEL-NEXT: v_pk_min_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s4, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_pk_min_f16 v3, s1, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s1, v1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-LABEL: test_fminimum_v3f16_ss:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_pk_minimum_f16 v0, s0, s2
@@ -206,97 +524,384 @@ define amdgpu_ps <3 x half> @test_fminimum_v3f16_ss(<3 x half> inreg %a, <3 x ha
}
define amdgpu_ps <4 x half> @test_fminimum_v4f16(<4 x half> %a, <4 x half> %b) {
-; GCN-LABEL: test_fminimum_v4f16:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, v0, v2
-; GCN-NEXT: v_pk_minimum_f16 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v4f16:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_min_f16 v4, v1, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_pk_min_f16 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: v_perm_b32 v1, v1, v6, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v4f16:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v6, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %val
}
define amdgpu_ps <4 x half> @test_fminimum_v4f16_ss(<4 x half> inreg %a, <4 x half> inreg %b) {
-; GCN-LABEL: test_fminimum_v4f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, s0, s2
-; GCN-NEXT: v_pk_minimum_f16 v1, s1, s3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v4f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: s_lshr_b32 s3, s3, 16
+; GFX9-SDAG-NEXT: v_pk_min_f16 v1, s1, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, s2
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_pk_min_f16 v4, s0, v4
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v5
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v2, v2, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v4f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s2, 16
+; GFX9-GISEL-NEXT: v_pk_min_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s4, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s3, 16
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: s_lshr_b32 s0, s1, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-GISEL-NEXT: v_pk_min_f16 v2, s1, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s1, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_minimum_f16 v0, s0, s2
+; GFX12-NEXT: v_pk_minimum_f16 v1, s1, s3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %val
}
define amdgpu_ps <2 x float> @test_fminimum_f64_vv(double %a, double %b) {
-; GCN-LABEL: test_fminimum_f64_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_f64_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_f64_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f64_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: ; return to shader part epilog
%val = call double @llvm.minimum.f64(double %a, double %b)
%ret = bitcast double %val to <2 x float>
ret <2 x float> %ret
}
define amdgpu_ps <2 x float> @test_fminimum_f64_ss(double inreg %a, double inreg %b) {
-; GCN-LABEL: test_fminimum_f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], s[0:1], s[2:3]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: ; return to shader part epilog
%val = call double @llvm.minimum.f64(double %a, double %b)
%ret = bitcast double %val to <2 x float>
ret <2 x float> %ret
}
define amdgpu_ps <4 x float> @test_fminimum_v2f64_ss(<2 x double> inreg %a, <2 x double> inreg %b) {
-; GCN-LABEL: test_fminimum_v2f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], s[0:1], s[4:5]
-; GCN-NEXT: v_minimum_f64 v[2:3], s[2:3], s[6:7]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v2f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-SDAG-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-SDAG-NEXT: v_min_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v2f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-GISEL-NEXT: v_min_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v5, s[0:1]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[6:7]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x double> @llvm.minimum.v2f64(<2 x double> %a, <2 x double> %b)
%ret = bitcast <2 x double> %val to <4 x float>
ret <4 x float> %ret
}
define amdgpu_ps <8 x float> @test_fminimum_v4f64(<4 x double> %a, <4 x double> %b) {
-; GCN-LABEL: test_fminimum_v4f64:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], v[0:1], v[8:9]
-; GCN-NEXT: v_minimum_f64 v[2:3], v[2:3], v[10:11]
-; GCN-NEXT: v_minimum_f64 v[4:5], v[4:5], v[12:13]
-; GCN-NEXT: v_minimum_f64 v[6:7], v[6:7], v[14:15]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v4f64:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-SDAG-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX9-SDAG-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], v[4:5], v[12:13]
+; GFX9-SDAG-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v7, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[4:5]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v4f64:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-GISEL-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX9-GISEL-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], v[4:5], v[12:13]
+; GFX9-GISEL-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v18, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v16, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v18, v17, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v18, v9, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v18, v11, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v7, v18, v13, s[4:5]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[8:9]
+; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[10:11]
+; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[12:13]
+; GFX12-NEXT: v_minimum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x double> @llvm.minimum.v4f64(<4 x double> %a, <4 x double> %b)
%ret = bitcast <4 x double> %val to <8 x float>
ret <8 x float> %ret
}
define amdgpu_ps <8 x float> @test_fminimum_v4f64_ss(<4 x double> inreg %a, <4 x double> inreg %b) {
-; GCN-LABEL: test_fminimum_v4f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], s[0:1], s[8:9]
-; GCN-NEXT: v_minimum_f64 v[2:3], s[2:3], s[10:11]
-; GCN-NEXT: v_minimum_f64 v[4:5], s[4:5], s[12:13]
-; GCN-NEXT: v_minimum_f64 v[6:7], s[6:7], s[14:15]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v4f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v10, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-SDAG-NEXT: v_min_f64 v[4:5], s[2:3], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[1:2]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-SDAG-NEXT: v_min_f64 v[6:7], s[4:5], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], s[4:5], v[1:2]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-SDAG-NEXT: v_min_f64 v[8:9], s[6:7], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[1:2]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v8, 0, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[4:5]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v4f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-GISEL-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-GISEL-NEXT: v_min_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s13
+; GFX9-GISEL-NEXT: v_min_f64 v[6:7], s[4:5], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], s[4:5], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s15
+; GFX9-GISEL-NEXT: v_min_f64 v[8:9], s[6:7], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[4:5], s[6:7], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v10, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v10, v5, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v10, v7, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v8, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v7, v10, v9, s[4:5]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[8:9]
+; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[10:11]
+; GFX12-NEXT: v_minimum_f64 v[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: v_minimum_f64 v[6:7], s[6:7], s[14:15]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x double> @llvm.minimum.v4f64(<4 x double> %a, <4 x double> %b)
%ret = bitcast <4 x double> %val to <8 x float>
ret <8 x float> %ret
}
define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
-; GCN-LABEL: fminimumi_f32_move_to_valu:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: v_minimum_f32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
-; GCN-NEXT: s_endpgm
+; GFX9-LABEL: fminimumi_f32_move_to_valu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX12-LABEL: fminimumi_f32_move_to_valu:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_minimum_f32 v1, v1, v2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%v = call float @llvm.minimum.f32(float %a, float %b)
@@ -305,6 +910,23 @@ define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
}
define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
+; GFX9-LABEL: fminimum_f16_move_to_valu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v4, v1, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
; GFX12-SDAG-TRUE16-LABEL: fminimum_f16_move_to_valu:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1
@@ -371,6 +993,40 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr
ret void
}
+define amdgpu_ps float @test_fminimum_f32_ieee_on(float %a, float %b) #0 {
+; GFX9-LABEL: test_fminimum_f32_ieee_on:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_ieee_on:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.minimum.f32(float %a, float %b)
+ ret float %val
+}
+
+define amdgpu_ps float @test_fminimum_f32_ieee_off(float %a, float %b) #1 {
+; GFX9-LABEL: test_fminimum_f32_ieee_off:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_ieee_off:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.minimum.f32(float %a, float %b)
+ ret float %val
+}
+
declare float @llvm.minimum.f32(float, float)
declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>)
declare <3 x float> @llvm.minimum.v3f32(<3 x float>, <3 x float>)
@@ -383,3 +1039,6 @@ declare <4 x half> @llvm.minimum.v4f16(<4 x half>, <4 x half>)
declare double @llvm.minimum.f64(double, double)
declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
+
+attributes #0 = { nounwind "amdgpu-ieee"="true" }
+attributes #1 = { nounwind "amdgpu-ieee"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 94afa88..9ebf6ae 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -4666,21 +4666,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(ptr add
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: v_or_b32_e32 v0, 16, v0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: v_or_b32_e32 v0, 16, v0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_or_b32_e32 v0, 16, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: ; return to shader part epilog
%zext.idx = zext i32 %idx to i64
%or = or i64 %zext.idx, 16
%addr = inttoptr i64 %or to ptr addrspace(1)
@@ -4707,21 +4699,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr a
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: v_or_b32_e32 v0, 0x1040, v0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: v_or_b32_e32 v0, 0x1040, v0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_or_b32_e32 v0, 0x1040, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: ; return to shader part epilog
%zext.idx = zext i32 %idx to i64
%or = or i64 %zext.idx, 4160
%addr = inttoptr i64 %or to ptr addrspace(1)
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
index 9684712..2f9182e 100644
--- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
@@ -1066,13 +1066,13 @@ define double @uitofp_i128_to_f64(i128 %x) {
; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
; GISEL-NEXT: v_lshlrev_b64 v[8:9], 30, v[2:3]
; GISEL-NEXT: v_lshrrev_b32_e32 v5, 2, v1
-; GISEL-NEXT: v_or_b32_e32 v9, v5, v8
+; GISEL-NEXT: v_or_b32_e32 v9, v8, v5
; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20
; GISEL-NEXT: v_lshlrev_b64 v[2:3], 29, v[2:3]
; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1]
; GISEL-NEXT: v_lshrrev_b32_e32 v0, 3, v1
-; GISEL-NEXT: v_or_b32_e32 v9, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v9, v2, v0
; GISEL-NEXT: v_mov_b32_e32 v7, v6
; GISEL-NEXT: ; %bb.12: ; %Flow
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
index 1e4b633..fc36ed9 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
@@ -45,27 +45,18 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10
-; GFX9-GISEL-NEXT: s_mov_b32 s11, 0
-; GFX9-GISEL-NEXT: s_mov_b32 s4, s11
-; GFX9-GISEL-NEXT: s_mov_b32 s6, s11
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s1
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s1
; GFX9-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s3
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc slc
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30
-; GFX9-GISEL-NEXT: s_mov_b32 s4, s11
-; GFX9-GISEL-NEXT: s_mov_b32 s6, s11
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s1
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s1
; GFX9-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s3
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
@@ -105,27 +96,18 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX942-GISEL: ; %bb.0: ; %entry
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942-GISEL-NEXT: s_load_dword s11, s[4:5], 0x10
-; GFX942-GISEL-NEXT: s_mov_b32 s7, 0
-; GFX942-GISEL-NEXT: s_mov_b32 s8, s7
-; GFX942-GISEL-NEXT: s_mov_b32 s10, s7
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s1
+; GFX942-GISEL-NEXT: s_mov_b32 s8, s1
; GFX942-GISEL-NEXT: s_mov_b32 s9, s2
-; GFX942-GISEL-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
+; GFX942-GISEL-NEXT: s_mov_b32 s10, s3
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
-; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30
-; GFX942-GISEL-NEXT: s_mov_b32 s4, s7
-; GFX942-GISEL-NEXT: s_mov_b32 s8, s7
+; GFX942-GISEL-NEXT: s_load_dword s7, s[4:5], 0x30
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s1
+; GFX942-GISEL-NEXT: s_mov_b32 s4, s1
; GFX942-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX942-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen nt
@@ -168,29 +150,22 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX10-GISEL-NEXT: s_load_dword s5, s[8:9], 0x10
-; GFX10-GISEL-NEXT: s_mov_b32 s7, 0
-; GFX10-GISEL-NEXT: s_mov_b32 s10, s7
-; GFX10-GISEL-NEXT: s_mov_b32 s4, s7
+; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_mov_b32 s6, s1
-; GFX10-GISEL-NEXT: s_mov_b32 s11, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[6:7], s[10:11]
+; GFX10-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5]
-; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen slc
+; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc
; GFX10-GISEL-NEXT: s_clause 0x1
-; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
-; GFX10-GISEL-NEXT: s_load_dword s11, s[8:9], 0x30
+; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_mov_b32 s6, s1
-; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
; GFX10-GISEL-NEXT: s_endpgm
@@ -234,32 +209,21 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10
-; GFX11-GISEL-NEXT: s_mov_b32 s9, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_mov_b32 s10, s9
-; GFX11-GISEL-NEXT: s_mov_b32 s6, s9
+; GFX11-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX11-GISEL-NEXT: s_mov_b32 s11, s2
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11]
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
-; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen slc dlc
+; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
+; GFX11-GISEL-NEXT: s_mov_b32 s9, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s10, s3
+; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
-; GFX11-GISEL-NEXT: s_mov_b32 s4, s9
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s6, s3
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen glc slc dlc
; GFX11-GISEL-NEXT: s_endpgm
@@ -303,32 +267,21 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10
-; GFX12-GISEL-NEXT: s_mov_b32 s9, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_mov_b32 s10, s9
-; GFX12-GISEL-NEXT: s_mov_b32 s6, s9
+; GFX12-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX12-GISEL-NEXT: s_mov_b32 s11, s2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11]
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
-; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT
+; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
+; GFX12-GISEL-NEXT: s_mov_b32 s9, s2
+; GFX12-GISEL-NEXT: s_mov_b32 s10, s3
+; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
-; GFX12-GISEL-NEXT: s_mov_b32 s4, s9
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX12-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
+; GFX12-GISEL-NEXT: s_mov_b32 s6, s3
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT
; GFX12-GISEL-NEXT: s_endpgm
@@ -374,28 +327,19 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10
-; GFX9-GISEL-NEXT: s_mov_b32 s11, 0
-; GFX9-GISEL-NEXT: s_mov_b32 s4, s11
-; GFX9-GISEL-NEXT: s_mov_b32 s6, s11
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s1
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s1
; GFX9-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s3
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30
-; GFX9-GISEL-NEXT: s_mov_b32 s4, s11
-; GFX9-GISEL-NEXT: s_mov_b32 s6, s11
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s1
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s1
; GFX9-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s3
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -436,28 +380,19 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-GISEL: ; %bb.0: ; %entry
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942-GISEL-NEXT: s_load_dword s11, s[4:5], 0x10
-; GFX942-GISEL-NEXT: s_mov_b32 s7, 0
-; GFX942-GISEL-NEXT: s_mov_b32 s8, s7
-; GFX942-GISEL-NEXT: s_mov_b32 s10, s7
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s1
+; GFX942-GISEL-NEXT: s_mov_b32 s8, s1
; GFX942-GISEL-NEXT: s_mov_b32 s9, s2
-; GFX942-GISEL-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
+; GFX942-GISEL-NEXT: s_mov_b32 s10, s3
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
-; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30
-; GFX942-GISEL-NEXT: s_mov_b32 s4, s7
-; GFX942-GISEL-NEXT: s_mov_b32 s8, s7
+; GFX942-GISEL-NEXT: s_load_dword s7, s[4:5], 0x30
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s1
+; GFX942-GISEL-NEXT: s_mov_b32 s4, s1
; GFX942-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX942-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -501,30 +436,23 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX10-GISEL-NEXT: s_load_dword s5, s[8:9], 0x10
-; GFX10-GISEL-NEXT: s_mov_b32 s7, 0
-; GFX10-GISEL-NEXT: s_mov_b32 s10, s7
-; GFX10-GISEL-NEXT: s_mov_b32 s4, s7
+; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_mov_b32 s6, s1
-; GFX10-GISEL-NEXT: s_mov_b32 s11, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[6:7], s[10:11]
+; GFX10-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5]
-; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc dlc
+; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: s_clause 0x1
-; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
-; GFX10-GISEL-NEXT: s_load_dword s11, s[8:9], 0x30
+; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_mov_b32 s6, s1
-; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-GISEL-NEXT: s_endpgm
@@ -569,33 +497,22 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10
-; GFX11-GISEL-NEXT: s_mov_b32 s9, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_mov_b32 s10, s9
-; GFX11-GISEL-NEXT: s_mov_b32 s6, s9
+; GFX11-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX11-GISEL-NEXT: s_mov_b32 s11, s2
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11]
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
-; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen glc dlc
+; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
+; GFX11-GISEL-NEXT: s_mov_b32 s9, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s10, s3
+; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
-; GFX11-GISEL-NEXT: s_mov_b32 s4, s9
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s6, s3
; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_endpgm
@@ -640,33 +557,22 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10
-; GFX12-GISEL-NEXT: s_mov_b32 s9, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_mov_b32 s10, s9
-; GFX12-GISEL-NEXT: s_mov_b32 s6, s9
+; GFX12-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX12-GISEL-NEXT: s_mov_b32 s11, s2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11]
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
-; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
+; GFX12-GISEL-NEXT: s_mov_b32 s9, s2
+; GFX12-GISEL-NEXT: s_mov_b32 s10, s3
+; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
-; GFX12-GISEL-NEXT: s_mov_b32 s4, s9
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX12-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
+; GFX12-GISEL-NEXT: s_mov_b32 s6, s3
; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/BPF/BTF/binary-format.ll b/llvm/test/CodeGen/BPF/BTF/binary-format.ll
index 3b1be1a..fd09566 100644
--- a/llvm/test/CodeGen/BPF/BTF/binary-format.ll
+++ b/llvm/test/CodeGen/BPF/BTF/binary-format.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -gdwarf-5 -gembed-source -S -emit-llvm t.c
; Function Attrs: nounwind readnone
-define dso_local i32 @f(i32 returned %a) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f(i32 returned %a) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata i32 %a, metadata !12, metadata !DIExpression()), !dbg !13
ret i32 %a, !dbg !14
@@ -42,10 +42,7 @@ entry:
; CHECK-EB: 0x00000050 00000008 0000000f 00000018 00000410
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll b/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll
index 2fb8d25..1672334 100644
--- a/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll
+++ b/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll
@@ -24,7 +24,7 @@
@bpf_log = internal global ptr inttoptr (i64 999 to ptr), align 8, !dbg !17
; Function Attrs: nounwind
-define dso_local void @prog1() #0 !dbg !28 {
+define dso_local void @prog1() !dbg !28 {
entry:
%0 = load ptr, ptr @bpf_log, align 8, !dbg !31, !tbaa !32
%1 = call i64 @llvm.bpf.btf.type.id(i32 0, i64 0), !dbg !36, !llvm.preserve.access.index !7
@@ -33,10 +33,10 @@ entry:
}
; Function Attrs: nounwind readnone
-declare i64 @llvm.bpf.btf.type.id(i32, i64) #1
+declare i64 @llvm.bpf.btf.type.id(i32, i64)
; Function Attrs: nounwind
-define dso_local void @prog2() #0 !dbg !38 {
+define dso_local void @prog2() !dbg !38 {
entry:
%0 = load ptr, ptr @bpf_log, align 8, !dbg !39, !tbaa !32
%1 = call i64 @llvm.bpf.btf.type.id(i32 1, i64 0), !dbg !40, !llvm.preserve.access.index !6
@@ -45,7 +45,7 @@ entry:
}
; Function Attrs: nounwind
-define dso_local void @prog3() #0 !dbg !42 {
+define dso_local void @prog3() !dbg !42 {
entry:
%0 = load ptr, ptr @bpf_log, align 8, !dbg !43, !tbaa !32
%1 = call i64 @llvm.bpf.btf.type.id(i32 2, i64 1), !dbg !44, !llvm.preserve.access.index !11
@@ -96,9 +96,6 @@ entry:
; CHECK-NEXT: .long 48
; CHECK-NEXT: .long 7
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!24, !25, !26}
!llvm.ident = !{!27}
diff --git a/llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll b/llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll
index cc14a32b..1c2b1d1 100644
--- a/llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll
+++ b/llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll
@@ -10,7 +10,7 @@
@g = dso_local local_unnamed_addr global i32 5, section "maps", align 4
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 {
+define dso_local i32 @test() local_unnamed_addr {
%1 = load i32, ptr @g, align 4, !tbaa !2
ret i32 %1
}
@@ -18,8 +18,6 @@ define dso_local i32 @test() local_unnamed_addr #0 {
; CHECK-NOT: .section .BTF
; CHECK-NOT: .section .BTF.ext
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-builtin.ll b/llvm/test/CodeGen/BPF/BTF/extern-builtin.ll
index a855016..fa0aa5b 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-builtin.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-builtin.ll
@@ -10,7 +10,7 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind readonly
-define dso_local i64 @test(ptr readonly %skb) local_unnamed_addr #0 !dbg !13 {
+define dso_local i64 @test(ptr readonly %skb) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %skb, metadata !17, metadata !DIExpression()), !dbg !18
%call = tail call i64 @llvm.bpf.load.byte(ptr %skb, i64 10), !dbg !19
@@ -54,13 +54,9 @@ entry:
; CHECK-NEXT: .byte 0
; Function Attrs: nounwind readonly
-declare !dbg !4 i64 @llvm.bpf.load.byte(ptr, i64) #1
+declare !dbg !4 i64 @llvm.bpf.load.byte(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readonly }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll b/llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll
index b7cbb48f..9a31beb 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll
@@ -8,9 +8,9 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test() local_unnamed_addr !dbg !13 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !16
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !16
ret i32 %call, !dbg !17
}
@@ -49,11 +49,7 @@ entry:
; CHECK: .ascii "char" # string offset=55
; CHECK: .ascii "global_func" # string offset=60
-declare !dbg !4 dso_local i32 @global_func(i8 signext) local_unnamed_addr #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare !dbg !4 dso_local i32 @global_func(i8 signext) local_unnamed_addr
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-global-var.ll b/llvm/test/CodeGen/BPF/BTF/extern-global-var.ll
index 299aa1d..c3f93ab 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-global-var.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-global-var.ll
@@ -10,7 +10,7 @@
@a = external dso_local local_unnamed_addr global i8, align 1
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !7 {
%1 = load i8, ptr @a, align 1, !dbg !11, !tbaa !12
%2 = sext i8 %1 to i32, !dbg !11
ret i32 %2, !dbg !15
@@ -45,8 +45,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .ascii "/home/yhs/work/tests/llvm/bug/test.c" # string offset=15
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll
index d11addd..0ddd634 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll
@@ -10,12 +10,12 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test() local_unnamed_addr !dbg !13 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !16
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !16
ret i32 %call, !dbg !17
}
-declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr #1 section "abc"
+declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr section "abc"
; CHECK: .section .BTF,"",@progbits
; CHECK-NEXT: .short 60319 # 0xeb9f
@@ -69,10 +69,6 @@ declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .ascii "abc" # string offset=72
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
!llvm.ident = !{!12}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll
index 9e82295..fbfc03b 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll
@@ -10,12 +10,12 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test() local_unnamed_addr !dbg !13 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !16
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !16
ret i32 %call, !dbg !17
}
-declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr #1
+declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr
; CHECK: .section .BTF,"",@progbits
; CHECK-NEXT: .short 60319 # 0xeb9f
@@ -62,10 +62,6 @@ declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed
; CHECK-NEXT: .ascii "global_func" # string offset=60
; CHECK-NEXT: .byte 0
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
!llvm.ident = !{!12}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-func.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-func.ll
index 262abb3..0ba4732 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-func.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-func.ll
@@ -10,9 +10,9 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test() local_unnamed_addr !dbg !13 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !16
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !16
ret i32 %call, !dbg !17
}
@@ -61,11 +61,7 @@ entry:
; CHECK-NEXT: .ascii "global_func" # string offset=60
; CHECK-NEXT: .byte 0
-declare !dbg !4 dso_local i32 @global_func(i8 signext) local_unnamed_addr #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare !dbg !4 dso_local i32 @global_func(i8 signext) local_unnamed_addr
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll
index b6e14fc..27793d1 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll
@@ -13,9 +13,9 @@
@ch = external dso_local local_unnamed_addr global i8, section "abc", align 1, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !16 {
+define dso_local i32 @test() local_unnamed_addr !dbg !16 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !19
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !19
%0 = load i8, ptr @ch, align 1, !dbg !20, !tbaa !21
%conv = sext i8 %0 to i32, !dbg !20
%add = add nsw i32 %call, %conv, !dbg !24
@@ -84,11 +84,7 @@ entry:
; CHECK-NEXT: .ascii "abc" # string offset=75
; CHECK-NEXT: .byte 0
-declare !dbg !6 dso_local i32 @global_func(i8 signext) local_unnamed_addr #1 section "abc"
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare !dbg !6 dso_local i32 @global_func(i8 signext) local_unnamed_addr section "abc"
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!12, !13, !14}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll
index 63ab578..ffec16b 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll
@@ -12,7 +12,7 @@
@global = extern_weak dso_local local_unnamed_addr global %struct.t1, align 4, !dbg !0
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test() local_unnamed_addr !dbg !15 {
entry:
%0 = load i32, ptr @global, align 4, !dbg !18, !tbaa !19
ret i32 %0, !dbg !24
@@ -68,8 +68,6 @@ entry:
; CHECK-NEXT: .ascii "global" # string offset=66
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!11, !12, !13}
!llvm.ident = !{!14}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll
index 3ecda4f..dfe5e5e 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll
@@ -13,7 +13,7 @@
@global = external dso_local local_unnamed_addr global %struct.t1, align 4, !dbg !0
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test() local_unnamed_addr !dbg !15 {
entry:
%0 = load i32, ptr @global, align 4, !dbg !18, !tbaa !19
ret i32 %0, !dbg !24
@@ -69,8 +69,6 @@ entry:
; CHECK-NEXT: .ascii "global" # string offset=66
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!11, !12, !13}
!llvm.ident = !{!14}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll
index 57ca18c..7d28987 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll
@@ -12,15 +12,15 @@
@ch = extern_weak dso_local local_unnamed_addr global i8, section "abc", align 1, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !16 {
+define dso_local i32 @test() local_unnamed_addr !dbg !16 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !19
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !19
%0 = load i8, ptr @ch, align 1, !dbg !20, !tbaa !21
%conv = sext i8 %0 to i32, !dbg !20
%add = add nsw i32 %call, %conv, !dbg !24
ret i32 %add, !dbg !25
}
-declare !dbg !6 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr #1 section "abc"
+declare !dbg !6 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr section "abc"
; CHECK: .section .BTF,"",@progbits
; CHECK-NEXT: .short 60319 # 0xeb9f
@@ -84,10 +84,6 @@ declare !dbg !6 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed
; CHECK-NEXT: .ascii "abc" # string offset=75
; CHECK-NEXT: .byte 0
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!12, !13, !14}
!llvm.ident = !{!15}
diff --git a/llvm/test/CodeGen/BPF/BTF/filename.ll b/llvm/test/CodeGen/BPF/BTF/filename.ll
index ae08aea..0d8742fa 100644
--- a/llvm/test/CodeGen/BPF/BTF/filename.ll
+++ b/llvm/test/CodeGen/BPF/BTF/filename.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: norecurse nounwind readnone uwtable
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test() local_unnamed_addr !dbg !7 {
ret i32 0, !dbg !11
}
@@ -63,8 +63,6 @@ define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 1038 # Line 1 Col 14
-attributes #0 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll b/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll
index b700be9..f8c3de5 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll
@@ -14,7 +14,7 @@
@b1 = common dso_local local_unnamed_addr global %struct.t1 zeroinitializer, align 8, !dbg !6
; Function Attrs: nounwind readnone
-define dso_local void @f1(i32 %p2) local_unnamed_addr #0 !dbg !19 {
+define dso_local void @f1(i32 %p2) local_unnamed_addr !dbg !19 {
entry:
call void @llvm.dbg.value(metadata i32 %p2, metadata !21, metadata !DIExpression()), !dbg !22
ret void, !dbg !23
@@ -95,10 +95,7 @@ entry:
; CHECK-NEXT: .long 3091 # Line 3 Col 19
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!15, !16, !17}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-non-void.ll b/llvm/test/CodeGen/BPF/BTF/func-non-void.ll
index 2f562b2..745645d 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-non-void.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-non-void.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: nounwind readnone
-define dso_local i32 @f1(i32 returned) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f1(i32 returned) local_unnamed_addr !dbg !7 {
call void @llvm.dbg.value(metadata i32 %0, metadata !12, metadata !DIExpression()), !dbg !13
ret i32 %0, !dbg !14
}
@@ -73,10 +73,7 @@ define dso_local i32 @f1(i32 returned) local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .long 1042 # Line 1 Col 18
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-source.ll b/llvm/test/CodeGen/BPF/BTF/func-source.ll
index a485d2c..c305e83 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-source.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-source.ll
@@ -10,7 +10,7 @@
; correct reference to the lines in the string table.
; Function Attrs: norecurse nounwind readnone
-define dso_local void @f() local_unnamed_addr #0 !dbg !7 {
+define dso_local void @f() local_unnamed_addr !dbg !7 {
entry:
ret void, !dbg !10
}
@@ -63,8 +63,6 @@ entry:
; CHECK-NEXT: .long 18
; CHECK-NEXT: .long 1040 # Line 1 Col 16
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-typedef.ll b/llvm/test/CodeGen/BPF/BTF/func-typedef.ll
index 2570536..388deeb 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-typedef.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-typedef.ll
@@ -9,7 +9,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: nounwind readnone
-define dso_local i32 @f(i32 returned %a) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f(i32 returned %a) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata i32 %a, metadata !14, metadata !DIExpression()), !dbg !15
ret i32 %a, !dbg !16
@@ -85,12 +85,8 @@ entry:
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 3092 # Line 3 Col 20
-
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll b/llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll
index f9439e6..380642c 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: nounwind readnone
-define dso_local i32 @f1(i32) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f1(i32) local_unnamed_addr !dbg !7 {
call void @llvm.dbg.value(metadata i32 %0, metadata !12, metadata !DIExpression()), !dbg !13
ret i32 0, !dbg !14
}
@@ -69,10 +69,7 @@ define dso_local i32 @f1(i32) local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .long 1042 # Line 1 Col 18
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-void.ll b/llvm/test/CodeGen/BPF/BTF/func-void.ll
index bf70b6a..9205700 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-void.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-void.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: norecurse nounwind readnone
-define dso_local void @f1() local_unnamed_addr #0 !dbg !7 {
+define dso_local void @f1() local_unnamed_addr !dbg !7 {
ret void, !dbg !10
}
@@ -57,8 +57,6 @@ define dso_local void @f1() local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 1040 # Line 1 Col 16
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll b/llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll
index 6ef7a30..5c797f7 100644
--- a/llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll
+++ b/llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll
@@ -21,16 +21,16 @@
@__const.test.val = private unnamed_addr constant %struct.anon { [4 x i32] [i32 2, i32 3, i32 4, i32 5] }, align 4
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test() local_unnamed_addr !dbg !7 {
entry:
%val = alloca %struct.anon, align 4
call void @llvm.dbg.value(metadata ptr @.str, metadata !12, metadata !DIExpression()), !dbg !25
- call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %val) #4, !dbg !26
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %val), !dbg !26
call void @llvm.dbg.declare(metadata ptr %val, metadata !16, metadata !DIExpression()), !dbg !27
call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 4 dereferenceable(16) %val, ptr nonnull align 4 dereferenceable(16) @__const.test.val, i64 16, i1 false), !dbg !27
- tail call void @foo(ptr @.str) #4, !dbg !28
- call void @foo(ptr nonnull %val) #4, !dbg !29
- call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %val) #4, !dbg !30
+ tail call void @foo(ptr @.str), !dbg !28
+ call void @foo(ptr nonnull %val), !dbg !29
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %val), !dbg !30
ret i32 0, !dbg !31
}
@@ -39,27 +39,21 @@ entry:
; CHECK-NOT: BTF_KIND_DATASEC
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
-declare !dbg !32 dso_local void @foo(ptr) local_unnamed_addr #3
+declare !dbg !32 dso_local void @foo(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nounwind readnone speculatable willreturn }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll b/llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll
index 0e183a5..243cd87 100644
--- a/llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll
+++ b/llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll
@@ -19,14 +19,14 @@
@__const.test.val = private unnamed_addr constant %struct.anon { [4 x i32] [i32 2, i32 3, i32 4, i32 5], i8 4 }, align 4
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test() local_unnamed_addr !dbg !7 {
entry:
%val = alloca %struct.anon, align 4
- call void @llvm.lifetime.start.p0(i64 20, ptr nonnull %val) #4, !dbg !23
+ call void @llvm.lifetime.start.p0(i64 20, ptr nonnull %val), !dbg !23
call void @llvm.dbg.declare(metadata ptr %val, metadata !12, metadata !DIExpression()), !dbg !24
call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 4 dereferenceable(20) %val, ptr nonnull align 4 dereferenceable(20) @__const.test.val, i64 20, i1 false), !dbg !24
- call void @foo(ptr nonnull %val) #4, !dbg !25
- call void @llvm.lifetime.end.p0(i64 20, ptr nonnull %val) #4, !dbg !26
+ call void @foo(ptr nonnull %val), !dbg !25
+ call void @llvm.lifetime.end.p0(i64 20, ptr nonnull %val), !dbg !26
ret i32 0, !dbg !27
}
@@ -38,24 +38,18 @@ entry:
; CHECK: .ascii ".rodata" # string offset=42
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
-declare !dbg !28 dso_local void @foo(ptr) local_unnamed_addr #3
+declare !dbg !28 dso_local void @foo(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nounwind readnone speculatable willreturn }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/local-var.ll b/llvm/test/CodeGen/BPF/BTF/local-var.ll
index dd79923..fa605d8 100644
--- a/llvm/test/CodeGen/BPF/BTF/local-var.ll
+++ b/llvm/test/CodeGen/BPF/BTF/local-var.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @foo(i8 signext) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @foo(i8 signext) local_unnamed_addr !dbg !7 {
%2 = alloca i16, align 2
call void @llvm.dbg.value(metadata i8 %0, metadata !13, metadata !DIExpression()), !dbg !17
call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %2), !dbg !18
@@ -59,20 +59,16 @@ define dso_local i32 @foo(i8 signext) local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .byte 0
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #2
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #2
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
-attributes #2 = { argmemonly nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/pruning-const.ll b/llvm/test/CodeGen/BPF/BTF/pruning-const.ll
index 8fef9c2..733815d 100644
--- a/llvm/test/CodeGen/BPF/BTF/pruning-const.ll
+++ b/llvm/test/CodeGen/BPF/BTF/pruning-const.ll
@@ -22,14 +22,14 @@
%struct.s2 = type { %struct.tt }
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @test1(ptr nocapture readnone %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test1(ptr nocapture readnone %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !22, metadata !DIExpression()), !dbg !23
ret i32 0, !dbg !24
}
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test2(ptr nocapture readonly %arg) local_unnamed_addr #1 !dbg !25 {
+define dso_local i32 @test2(ptr nocapture readonly %arg) local_unnamed_addr !dbg !25 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !33, metadata !DIExpression()), !dbg !34
%0 = load i32, ptr %arg, align 4, !dbg !35, !tbaa !36
@@ -64,11 +64,7 @@ entry:
; CHECK: .ascii "m2" # string offset=72
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll b/llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll
index 4c8aa1f..727daea 100644
--- a/llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll
+++ b/llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll
@@ -24,14 +24,14 @@
%struct.s2 = type { %struct.tt }
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @test1(ptr nocapture readnone %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test1(ptr nocapture readnone %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !23, metadata !DIExpression()), !dbg !24
ret i32 0, !dbg !25
}
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test2(ptr nocapture readonly %arg) local_unnamed_addr #1 !dbg !26 {
+define dso_local i32 @test2(ptr nocapture readonly %arg) local_unnamed_addr !dbg !26 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !34, metadata !DIExpression()), !dbg !35
%0 = load i32, ptr %arg, align 4, !dbg !36, !tbaa !37
@@ -71,11 +71,7 @@ entry:
; CHECK: .ascii "m2" # string offset=81
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-func.ll b/llvm/test/CodeGen/BPF/BTF/static-func.ll
index fc79dbf..6506407 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-func.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-func.ll
@@ -9,18 +9,18 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test2() local_unnamed_addr #0 !dbg !12 {
+define dso_local i32 @test2() local_unnamed_addr !dbg !12 {
entry:
%call = tail call fastcc i32 @test1(), !dbg !13
ret i32 %call, !dbg !14
}
; Function Attrs: noinline nounwind
-define internal fastcc i32 @test1() unnamed_addr #1 !dbg !15 {
+define internal fastcc i32 @test1() unnamed_addr !dbg !15 {
entry:
- %call = tail call i32 @foo() #3, !dbg !16
+ %call = tail call i32 @foo(), !dbg !16
ret i32 %call, !dbg !17
}
-declare !dbg !4 dso_local i32 @foo() local_unnamed_addr #2
+declare !dbg !4 dso_local i32 @foo() local_unnamed_addr
; CHECK: .section .BTF,"",@progbits
; CHECK-NEXT: .short 60319 # 0xeb9f
@@ -67,11 +67,6 @@ declare !dbg !4 dso_local i32 @foo() local_unnamed_addr #2
; CHECK-NEXT: .ascii "foo" # string offset=60
; CHECK-NEXT: .byte 0
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9, !10}
!llvm.ident = !{!11}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll b/llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll
index 1827c97..fedec38 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll
@@ -17,7 +17,7 @@
@v4 = internal constant ptr null, align 8, !dbg !19
; Function Attrs: norecurse nounwind
-define dso_local i64 @foo() local_unnamed_addr #0 !dbg !27 {
+define dso_local i64 @foo() local_unnamed_addr !dbg !27 {
%1 = load volatile ptr, ptr @v1, align 8, !dbg !29, !tbaa !30
%2 = load volatile ptr, ptr @v2, align 8, !dbg !34, !tbaa !30
%3 = ptrtoint ptr %1 to i64, !dbg !35
@@ -141,8 +141,6 @@ define dso_local i64 @foo() local_unnamed_addr #0 !dbg !27 {
; CHECK-NEXT: .ascii ".rodata" # string offset=87
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!23, !24, !25}
!llvm.ident = !{!26}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll b/llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll
index cc785b7..deef48a 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll
@@ -14,7 +14,7 @@
@a = internal global i8 3, section "maps", align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !20, !tbaa !21
%2 = sext i8 %1 to i32, !dbg !20
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !24, !tbaa !25
@@ -93,8 +93,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii "maps" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!16, !17, !18}
!llvm.ident = !{!19}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-inited.ll b/llvm/test/CodeGen/BPF/BTF/static-var-inited.ll
index 2b62882..8f29a83 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-inited.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-inited.ll
@@ -14,7 +14,7 @@
@a = internal global i8 3, align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !20, !tbaa !21
%2 = sext i8 %1 to i32, !dbg !20
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !24, !tbaa !25
@@ -93,8 +93,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii ".data" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!16, !17, !18}
!llvm.ident = !{!19}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll b/llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll
index a4ae948..e16b467 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll
@@ -14,7 +14,7 @@
@a = internal constant i8 0, section "maps", align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !22, !tbaa !23
%2 = sext i8 %1 to i32, !dbg !22
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !26, !tbaa !27
@@ -99,8 +99,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii "maps" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!18, !19, !20}
!llvm.ident = !{!21}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll b/llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll
index a9d60ce..1ddd499 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll
@@ -14,7 +14,7 @@
@a = internal constant i8 0, align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !22, !tbaa !23
%2 = sext i8 %1 to i32, !dbg !22
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !26, !tbaa !27
@@ -99,8 +99,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii ".rodata" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!18, !19, !20}
!llvm.ident = !{!21}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-sec.ll b/llvm/test/CodeGen/BPF/BTF/static-var-sec.ll
index ac27b2b..0ff8f2e 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-sec.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-sec.ll
@@ -14,7 +14,7 @@
@a = internal global i8 0, section "maps", align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !20, !tbaa !21
%2 = sext i8 %1 to i32, !dbg !20
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !24, !tbaa !25
@@ -93,8 +93,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii "maps" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!16, !17, !18}
!llvm.ident = !{!19}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll b/llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll
index 28da203..fe9f508 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll
@@ -15,7 +15,7 @@
@sv = internal global { i32, i32, [10 x i8] } { i32 3, i32 4, [10 x i8] c"abcdefghi\00" }, align 4, !dbg !0
; Function Attrs: norecurse nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !21 {
+define dso_local i32 @test() local_unnamed_addr !dbg !21 {
%1 = load volatile i32, ptr @sv, align 4, !dbg !24, !tbaa !25
ret i32 %1, !dbg !29
}
@@ -104,8 +104,6 @@ define dso_local i32 @test() local_unnamed_addr #0 !dbg !21 {
; CHECK-NEXT: .ascii ".data" # string offset=89
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!17, !18, !19}
!llvm.ident = !{!20}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var.ll b/llvm/test/CodeGen/BPF/BTF/static-var.ll
index 461bd27..f7710e3 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var.ll
@@ -14,7 +14,7 @@
@a = internal global i8 0, align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !20, !tbaa !21
%2 = sext i8 %1 to i32, !dbg !20
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !24, !tbaa !25
@@ -93,8 +93,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii ".bss" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!16, !17, !18}
!llvm.ident = !{!19}
diff --git a/llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll b/llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll
index 5b125ea..68d4be0 100644
--- a/llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll
+++ b/llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll
@@ -15,7 +15,7 @@
%struct.anon.0 = type { i64 }
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @f1(ptr nocapture readnone %s1) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f1(ptr nocapture readnone %s1) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %s1, metadata !25, metadata !DIExpression()), !dbg !26
ret i32 0, !dbg !27
@@ -65,12 +65,8 @@ entry:
; CHECK: .ascii "B1" # string offset=17
; CHECK: .ascii "long int" # string offset=20
-
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/weak-global-2.ll b/llvm/test/CodeGen/BPF/BTF/weak-global-2.ll
index 4b3b557..14cb8e0 100644
--- a/llvm/test/CodeGen/BPF/BTF/weak-global-2.ll
+++ b/llvm/test/CodeGen/BPF/BTF/weak-global-2.ll
@@ -11,7 +11,7 @@
@g = weak dso_local local_unnamed_addr global i8 2, align 1, !dbg !0
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test() local_unnamed_addr !dbg !11 {
entry:
%0 = load i8, ptr @g, align 1, !dbg !15, !tbaa !16
%conv = sext i8 %0 to i32, !dbg !15
@@ -37,9 +37,6 @@ entry:
; CHECK: .byte 103 # string offset=60
; CHECK: .ascii ".data" # string offset=62
-
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!7, !8, !9}
!llvm.ident = !{!10}
diff --git a/llvm/test/CodeGen/BPF/BTF/weak-global.ll b/llvm/test/CodeGen/BPF/BTF/weak-global.ll
index ea0a887..5605e0b 100644
--- a/llvm/test/CodeGen/BPF/BTF/weak-global.ll
+++ b/llvm/test/CodeGen/BPF/BTF/weak-global.ll
@@ -11,7 +11,7 @@
@g = weak dso_local local_unnamed_addr global i8 0, align 1, !dbg !0
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test() local_unnamed_addr !dbg !11 {
entry:
%0 = load i8, ptr @g, align 1, !dbg !15, !tbaa !16
%conv = sext i8 %0 to i32, !dbg !15
@@ -37,8 +37,6 @@ entry:
; CHECK: .byte 103 # string offset=60
; CHECK: .ascii ".bss" # string offset=62
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!7, !8, !9}
!llvm.ident = !{!10}
diff --git a/llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll b/llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll
index 23a4617..eecb993 100644
--- a/llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll
+++ b/llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll
@@ -13,7 +13,7 @@
%struct.s1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @foo(ptr %arg) #0 !dbg !7 {
+define dso_local i32 @foo(ptr %arg) !dbg !7 {
entry:
%arg.addr = alloca ptr, align 8
store ptr %arg, ptr %arg.addr, align 8, !tbaa !18
@@ -24,13 +24,13 @@ entry:
}
; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind readnone
-declare i64 @llvm.bpf.btf.type.id(i32, i64) #2
+declare i64 @llvm.bpf.btf.type.id(i32, i64)
; Function Attrs: nounwind
-define dso_local i32 @bar(ptr %arg) #0 !dbg !25 {
+define dso_local i32 @bar(ptr %arg) !dbg !25 {
entry:
%arg.addr = alloca ptr, align 8
store ptr %arg, ptr %arg.addr, align 8, !tbaa !18
@@ -58,10 +58,6 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 6
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
-attributes #2 = { nounwind readnone }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll
index 40a2432..0851f25 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll
@@ -15,7 +15,7 @@ target triple = "bpf"
@c = common dso_local global %struct.b zeroinitializer, align 4, !dbg !0
; Function Attrs: nounwind readnone
-define dso_local i32 @f() local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @f() local_unnamed_addr !dbg !15 {
entry:
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.bs(ptr elementtype(%struct.b) nonnull @c, i32 1, i32 1), !dbg !18, !llvm.preserve.access.index !6
%1 = tail call i32 @llvm.bpf.preserve.field.info.p0(ptr %0, i64 0), !dbg !19
@@ -40,13 +40,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.bs(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.bs(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
index b8b7a0b..51df39b 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
@@ -25,7 +25,7 @@ target triple = "bpfeb"
%struct.s = type { i64, i32, i32, i32, i8, i8 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !13 {
; CHECK-ALU64-LABEL: test:
; CHECK-ALU64: .Ltest$local:
; CHECK-ALU64-NEXT: .type .Ltest$local,@function
@@ -122,17 +122,13 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
@@ -177,4 +173,3 @@ attributes #2 = { nounwind readnone speculatable }
!36 = !DILocation(line: 14, column: 10, scope: !13)
!37 = !DILocation(line: 13, column: 67, scope: !13)
!38 = !DILocation(line: 12, column: 3, scope: !13)
-
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
index 4cf0a13..295c105 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
@@ -25,7 +25,7 @@ target triple = "bpfel"
%struct.s = type { i64, i32, i32, i32, i8, i8 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !13 {
; CHECK-ALU64-LABEL: test:
; CHECK-ALU64: .Ltest$local:
; CHECK-ALU64-NEXT: .type .Ltest$local,@function
@@ -122,17 +122,13 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll
index cdcd7e6..8f83404 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll
@@ -26,7 +26,7 @@ target triple = "bpfeb"
%struct.s = type <{ i8, i16 }>
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !28
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 4), !dbg !29, !llvm.preserve.access.index !18
@@ -70,17 +70,13 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll
index dd7f1c7..1a7619a 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll
@@ -26,7 +26,7 @@ target triple = "bpfel"
%struct.s = type <{ i8, i16 }>
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !28
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 4), !dbg !29, !llvm.preserve.access.index !18
@@ -70,17 +70,13 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll
index 126bd0a..5a98b05 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll
@@ -13,7 +13,7 @@
%struct.s1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @foo(ptr %arg) #0 !dbg !7 {
+define dso_local i32 @foo(ptr %arg) !dbg !7 {
entry:
%arg.addr = alloca ptr, align 8
store ptr %arg, ptr %arg.addr, align 8, !tbaa !18
@@ -25,13 +25,13 @@ entry:
}
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind
-define dso_local i32 @bar(ptr %arg) #0 !dbg !29 {
+define dso_local i32 @bar(ptr %arg) !dbg !29 {
entry:
%arg.addr = alloca ptr, align 8
store ptr %arg, ptr %arg.addr, align 8, !tbaa !18
@@ -60,10 +60,6 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 0
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable willreturn }
-attributes #2 = { nounwind readnone }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll
index 90681d3c..00c3a6d 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll
@@ -17,7 +17,7 @@ target triple = "bpf"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
call void @llvm.dbg.value(metadata ptr null, metadata !21, metadata !DIExpression()), !dbg !22
%0 = tail call ptr @llvm.preserve.array.access.index.p0.s1s.p0.s1s(ptr elementtype(%struct.s1) null, i32 0, i32 0), !dbg !23, !llvm.preserve.access.index !8
@@ -40,17 +40,13 @@ entry:
; CHECK-NEXT: .long 2
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.s1s.p0.s1s(ptr, i32 immarg, i32 immarg) #1
+declare ptr @llvm.preserve.array.access.index.p0.s1s.p0.s1s(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0.s1s(ptr, i64 immarg) #1
+declare i32 @llvm.bpf.preserve.field.info.p0.s1s(ptr, i64 immarg)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll
index d6bed6c..7e2e8e6 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll
@@ -15,12 +15,12 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !17, metadata !DIExpression()), !dbg !18
%0 = tail call ptr @llvm.preserve.array.access.index.p0.ss.p0.ss(ptr elementtype(%struct.s) %arg, i32 0, i32 2), !dbg !19, !llvm.preserve.access.index !11
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %0, i32 1, i32 1), !dbg !19, !llvm.preserve.access.index !12
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !20
+ %call = tail call i32 @get_value(ptr %1), !dbg !20
ret i32 %call, !dbg !21
}
; CHECK-LABEL: test
@@ -39,22 +39,16 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.ss.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.ss.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll
index 525f38d..cb6674f 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !34, !llvm.preserve.access.index !16
@@ -85,20 +85,16 @@ entry:
; CHECK-NEXT: .long 1
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll
index 11235b5..2697201 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll
@@ -21,7 +21,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i8 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !31
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !32, !llvm.preserve.access.index !16
@@ -71,27 +71,23 @@ entry:
; CHECK-NEXT: .long 1
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0.s1s(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0.s1s(ptr, i64)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll
index e3382d6..b7541f0 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
%struct.s1 = type { [10 x [10 x i32]] }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !18 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !18 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !31, metadata !DIExpression()), !dbg !34
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !35, !llvm.preserve.access.index !22
@@ -60,27 +60,23 @@ entry:
; CHECK-NEXT: .long 1
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #1
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll
index fda7592..0220567 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll
@@ -15,7 +15,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i8, i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr readnone %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr readnone %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !23, metadata !DIExpression()), !dbg !24
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr elementtype(%struct.s1) %arg, i32 1, i32 1), !dbg !25, !llvm.preserve.access.index !17
@@ -41,17 +41,13 @@ entry:
; CHECK-NEXT: .long 1
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll
index 69872db3..0404deb 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%union.u1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg1, metadata !29, metadata !DIExpression()), !dbg !35
call void @llvm.dbg.value(metadata ptr %arg2, metadata !30, metadata !DIExpression()), !dbg !35
@@ -85,29 +85,25 @@ entry:
; CHECK-NEXT: .long 2
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.u1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.u1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll
index 90706e9..240083f 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i16 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !30
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !31, !llvm.preserve.access.index !16
@@ -59,24 +59,20 @@ entry:
; CHECK-NEXT: .long 2
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll
index 2297040..57dd5b7 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll
@@ -19,7 +19,7 @@ target triple = "bpf"
%struct.s1 = type { [10 x [10 x i32]] }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !18 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !18 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !31, metadata !DIExpression()), !dbg !34
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !35, !llvm.preserve.access.index !22
@@ -59,27 +59,23 @@ entry:
; CHECK-NEXT: .long 2
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #1
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll
index 503a26c..7caa667 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll
@@ -23,7 +23,7 @@ target triple = "bpfeb"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !34, !llvm.preserve.access.index !16
@@ -86,20 +86,16 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll
index 0327f1a..c518573 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll
@@ -23,7 +23,7 @@ target triple = "bpfel"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !34, !llvm.preserve.access.index !16
@@ -86,20 +86,16 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll
index 2a92d08..6bf29d4 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll
@@ -21,7 +21,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i16 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !30
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !31, !llvm.preserve.access.index !16
@@ -60,24 +60,20 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll
index 6e62bb3..441366f 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !34, !llvm.preserve.access.index !16
@@ -85,20 +85,16 @@ entry:
; CHECK-NEXT: .long 5
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll
index 77ea26a..7bc994d 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i8 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !30
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !31, !llvm.preserve.access.index !16
@@ -59,24 +59,20 @@ entry:
; CHECK-NEXT: .long 5
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll
index 556f69f..ebfecff 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
%struct.s1 = type { [5 x [5 x i8]] }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !18 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !18 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !32, metadata !DIExpression()), !dbg !35
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !36, !llvm.preserve.access.index !23
@@ -60,27 +60,23 @@ entry:
; CHECK-NEXT: .long 5
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #1
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll
index 2741050..d50701c 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%union.u1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg1, metadata !29, metadata !DIExpression()), !dbg !35
call void @llvm.dbg.value(metadata ptr %arg2, metadata !30, metadata !DIExpression()), !dbg !35
@@ -85,29 +85,25 @@ entry:
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.u1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.u1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll
index b71bbf3..312d40f 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll
@@ -25,7 +25,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i16 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !20 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !20 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !37, metadata !DIExpression()), !dbg !41
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !42, !llvm.preserve.access.index !24
@@ -76,24 +76,20 @@ entry:
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!16, !17, !18}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll
index 5caea97..12a21c7 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll
@@ -24,7 +24,7 @@ target triple = "bpf"
%struct.s1 = type { [10 x i32], [10 x [10 x i32]] }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !29 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !29 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !43, metadata !DIExpression()), !dbg !46
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !47, !llvm.preserve.access.index !33
@@ -66,27 +66,23 @@ entry:
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #1
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!25, !26, !27}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll
index 8b95b1c4..13c7d1d 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll
@@ -15,11 +15,11 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !17, metadata !DIExpression()), !dbg !18
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 1), !dbg !19, !llvm.preserve.access.index !12
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !20
+ %call = tail call i32 @get_value(ptr %0), !dbg !20
ret i32 %call, !dbg !21
}
@@ -39,19 +39,13 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll
index 88658b6..8583322 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
@2 = private unnamed_addr constant [18 x i8] c"VAL10:-2147483648\00", align 1
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !18 {
+define dso_local i32 @test() local_unnamed_addr !dbg !18 {
entry:
%0 = tail call i64 @llvm.bpf.preserve.enum.value(i32 0, ptr @0, i64 0), !dbg !23, !llvm.preserve.access.index !3
%1 = tail call i64 @llvm.bpf.preserve.enum.value(i32 1, ptr @1, i64 1), !dbg !24, !llvm.preserve.access.index !3
@@ -81,10 +81,7 @@ entry:
; CHECK-NEXT: .long 11
; Function Attrs: nounwind readnone
-declare i64 @llvm.bpf.preserve.enum.value(i32, ptr, i64) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare i64 @llvm.bpf.preserve.enum.value(i32, ptr, i64)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll
index 0bdf954..6f316d9 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll
@@ -17,7 +17,7 @@
target triple = "bpf"
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
%0 = tail call i32 @llvm.bpf.preserve.type.info(i32 0, i64 0), !dbg !19, !llvm.preserve.access.index !8
%1 = tail call i32 @llvm.bpf.preserve.type.info(i32 1, i64 0), !dbg !20, !llvm.preserve.access.index !21
@@ -59,10 +59,7 @@ entry:
; CHECK-NEXT: .long 8
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.type.info(i32, i64) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare i32 @llvm.bpf.preserve.type.info(i32, i64)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll
index ddd3711..d3aacc72 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll
@@ -17,7 +17,7 @@
target triple = "bpf"
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
%0 = tail call i32 @llvm.bpf.preserve.type.info(i32 0, i64 1), !dbg !19, !llvm.preserve.access.index !8
%1 = tail call i32 @llvm.bpf.preserve.type.info(i32 1, i64 1), !dbg !20, !llvm.preserve.access.index !21
@@ -59,10 +59,7 @@ entry:
; CHECK-NEXT: .long 9
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.type.info(i32, i64) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare i32 @llvm.bpf.preserve.type.info(i32, i64)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll
index b2f8e48..ad4fc96 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll
@@ -20,7 +20,7 @@
target triple = "bpf"
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
call void @llvm.dbg.declare(metadata ptr undef, metadata !20, metadata !DIExpression()), !dbg !28
call void @llvm.dbg.declare(metadata ptr undef, metadata !19, metadata !DIExpression()), !dbg !29
@@ -65,14 +65,10 @@ entry:
; CHECK-NEXT: .long 9
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.type.info(i32, i64) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable willreturn }
-attributes #2 = { nounwind readnone }
+declare i32 @llvm.bpf.preserve.type.info(i32, i64)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll
index ef360929..e0217dd 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll
@@ -15,11 +15,11 @@ target triple = "bpf"
%union.u = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !17, metadata !DIExpression()), !dbg !18
%0 = tail call ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr %arg, i32 1), !dbg !19, !llvm.preserve.access.index !12
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !20
+ %call = tail call i32 @get_value(ptr %0), !dbg !20
ret i32 %call, !dbg !21
}
; CHECK-LABEL: test
@@ -38,19 +38,13 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll b/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll
index 4c6ce1e..819ee31 100644
--- a/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll
+++ b/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll
@@ -15,7 +15,7 @@ target triple = "bpf"
%struct.tt = type { i32 }
; Function Attrs: nounwind readonly
-define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !16, metadata !DIExpression()), !dbg !17
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.tts(ptr elementtype(%struct.tt) %arg, i32 0, i32 0), !dbg !18, !llvm.preserve.access.index !12
@@ -26,14 +26,10 @@ entry:
; CHECK-NOT: llvm.tt:0:0$0:0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.tts(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.tts(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable}
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll b/llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll
index 9998c98..c3f8395 100644
--- a/llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll
+++ b/llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll
@@ -28,7 +28,7 @@ target triple = "bpf"
%struct.data_t = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local void @test(ptr readonly %args) local_unnamed_addr #0 !dbg !12 {
+define dso_local void @test(ptr readonly %args) local_unnamed_addr !dbg !12 {
entry:
%data = alloca i64, align 8
call void @llvm.dbg.value(metadata ptr %args, metadata !22, metadata !DIExpression()), !dbg !29
@@ -36,7 +36,7 @@ entry:
%1 = load i32, ptr %0, align 4, !dbg !30, !tbaa !31
%and = and i32 %1, 65536, !dbg !36
call void @llvm.dbg.value(metadata i32 %and, metadata !23, metadata !DIExpression()), !dbg !29
- call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %data) #5, !dbg !37
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %data), !dbg !37
call void @llvm.dbg.declare(metadata ptr %data, metadata !24, metadata !DIExpression()), !dbg !38
store i64 0, ptr %data, align 8, !dbg !38
%tobool = icmp eq i32 %and, 0, !dbg !39
@@ -60,8 +60,8 @@ lor.end: ; preds = %lor.end.critedge, %
%5 = phi i32 [ %phitmp, %cond.false ], [ 1, %lor.end.critedge ]
%d2 = getelementptr inbounds %struct.data_t, ptr %data, i64 0, i32 1, !dbg !49
store i32 %5, ptr %d2, align 4, !dbg !50, !tbaa !51
- call void @output(ptr nonnull %data) #5, !dbg !52
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %data) #5, !dbg !53
+ call void @output(ptr nonnull %data), !dbg !52
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %data), !dbg !53
ret void, !dbg !53
}
@@ -71,28 +71,21 @@ lor.end: ; preds = %lor.end.critedge, %
; CHECK: r[[LOAD]] &= 32768
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.info_ts(ptr, i32 immarg, i32 immarg) #3
+declare ptr @llvm.preserve.struct.access.index.p0.p0.info_ts(ptr, i32 immarg, i32 immarg)
-declare !dbg !4 dso_local void @output(ptr) local_unnamed_addr #4
+declare !dbg !4 dso_local void @output(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable willreturn }
-attributes #2 = { argmemonly nounwind willreturn }
-attributes #3 = { nounwind readnone }
-attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9, !10}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll
index 5da2bbd..1ce453c 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll
@@ -18,13 +18,13 @@ target triple = "bpf"
%struct.t = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg1, metadata !22, metadata !DIExpression()), !dbg !24
call void @llvm.dbg.value(metadata ptr %arg2, metadata !23, metadata !DIExpression()), !dbg !24
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg1, i32 1, i32 1), !dbg !25, !llvm.preserve.access.index !12
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr elementtype(%struct.t) %arg2, i32 1, i32 1), !dbg !26, !llvm.preserve.access.index !17
- %call = tail call i32 @get_value(ptr %0, ptr %1) #4, !dbg !27
+ %call = tail call i32 @get_value(ptr %0, ptr %1), !dbg !27
ret i32 %call, !dbg !28
}
@@ -46,22 +46,16 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr, ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr, ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
index 024ed04..0fdd704 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
@@ -24,19 +24,19 @@ target triple = "bpf"
%struct.net_device = type opaque
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca ptr, align 8
call void @llvm.dbg.value(metadata ptr %0, metadata !26, metadata !DIExpression()), !dbg !28
- call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %2) #4, !dbg !29
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %2), !dbg !29
call void @llvm.dbg.value(metadata ptr null, metadata !27, metadata !DIExpression()), !dbg !28
store ptr null, ptr %2, align 8, !dbg !30, !tbaa !31
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr elementtype(%struct.sk_buff) %0, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !19
- %4 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 8, ptr %3) #4, !dbg !36
+ %4 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 8, ptr %3), !dbg !36
%5 = load ptr, ptr %2, align 8, !dbg !37, !tbaa !31
call void @llvm.dbg.value(metadata ptr %5, metadata !27, metadata !DIExpression()), !dbg !28
%6 = icmp ne ptr %5, null, !dbg !38
%7 = zext i1 %6 to i32, !dbg !38
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %2) #4, !dbg !39
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %2), !dbg !39
ret i32 %7, !dbg !40
}
@@ -122,22 +122,16 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll
index e12221e..65859c86 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll
@@ -21,7 +21,7 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !22 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !22 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !32, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !34, !llvm.preserve.access.index !26
@@ -30,7 +30,7 @@ entry:
%3 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x %struct.v1]) %2, i32 0, i32 0), !dbg !34, !llvm.preserve.access.index !4
%4 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x %struct.v1]) %3, i32 1, i32 2), !dbg !34, !llvm.preserve.access.index !5
%5 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v1) %4, i32 1, i32 1), !dbg !34, !llvm.preserve.access.index !8
- %call = tail call i32 @get_value(ptr %5) #4, !dbg !35
+ %call = tail call i32 @get_value(ptr %5), !dbg !35
ret i32 %call, !dbg !36
}
@@ -60,13 +60,13 @@ entry:
; CHECK-NEXT: .long 107
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
@@ -75,13 +75,7 @@ declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!18, !19, !20}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll
index 1764c9d..f42e7e6 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll
@@ -21,7 +21,7 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !24 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !24 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !34, metadata !DIExpression()), !dbg !35
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !36, !llvm.preserve.access.index !28
@@ -31,7 +31,7 @@ entry:
%4 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x [4 x %struct.v1]]) %3, i32 1, i32 2), !dbg !36, !llvm.preserve.access.index !5
%5 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x %struct.v1]) %4, i32 1, i32 3), !dbg !36, !llvm.preserve.access.index !18
%6 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v1) %5, i32 1, i32 1), !dbg !36, !llvm.preserve.access.index !8
- %call = tail call i32 @get_value(ptr %6) #4, !dbg !37
+ %call = tail call i32 @get_value(ptr %6), !dbg !37
ret i32 %call, !dbg !38
}
@@ -62,13 +62,13 @@ entry:
; CHECK-NEXT: .long 107
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
@@ -79,13 +79,7 @@ declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!20, !21, !22}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll
index bbff3f6..38b1c99 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll
@@ -21,12 +21,12 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !14 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !14 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !29
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !30, !llvm.preserve.access.index !18
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr elementtype(%struct.v1) %0, i32 1, i32 1), !dbg !30, !llvm.preserve.access.index !5
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !31
+ %call = tail call i32 @get_value(ptr %1), !dbg !31
ret i32 %call, !dbg !32
}
@@ -60,22 +60,16 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!10, !11, !12}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll
index bdc17e6..7730ee3a 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll
@@ -24,12 +24,12 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !15 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !33, metadata !DIExpression()), !dbg !34
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !20
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr elementtype(%struct.v1) %0, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !6
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !36
+ %call = tail call i32 @get_value(ptr %1), !dbg !36
ret i32 %call, !dbg !37
}
@@ -47,7 +47,6 @@ entry:
; CHECK: .ascii "0:1" # string offset=45
; CHECK: .ascii "v1" # string offset=91
-
; CHECK: .long 16 # FieldReloc
; CHECK-NEXT: .long 39 # Field reloc section string offset=39
; CHECK-NEXT: .long 2
@@ -60,22 +59,16 @@ entry:
; CHECK-NEXT: .long 45
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll
index dea6e40..e5ef549 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll
@@ -22,14 +22,14 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !19 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !19 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !30, metadata !DIExpression()), !dbg !31
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !32, !llvm.preserve.access.index !24
%1 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([40 x i32]) %0, i32 1, i32 4), !dbg !32, !llvm.preserve.access.index !11
%2 = bitcast ptr %1 to ptr, !dbg !32
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v1) %2, i32 1, i32 1), !dbg !32, !llvm.preserve.access.index !6
- %call = tail call i32 @get_value(ptr %3) #4, !dbg !33
+ %call = tail call i32 @get_value(ptr %3), !dbg !33
ret i32 %call, !dbg !34
}
@@ -60,24 +60,18 @@ entry:
; CHECK-NEXT: .long 118
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!15, !16, !17}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll
index 98fdfde..7aeaed4 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll
@@ -24,14 +24,14 @@ target triple = "bpf"
%union.v1 = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !15 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !33, metadata !DIExpression()), !dbg !34
%0 = tail call ptr @llvm.preserve.union.access.index.p0.p0(ptr %arg, i32 1), !dbg !35, !llvm.preserve.access.index !20
%1 = bitcast ptr %0 to ptr, !dbg !35
%2 = tail call ptr @llvm.preserve.union.access.index.p0.p0(ptr %1, i32 1), !dbg !35, !llvm.preserve.access.index !6
%b = getelementptr inbounds %union.v1, ptr %2, i64 0, i32 0, !dbg !35
- %call = tail call i32 @get_value(ptr %b) #4, !dbg !36
+ %call = tail call i32 @get_value(ptr %b), !dbg !36
ret i32 %call, !dbg !37
}
@@ -61,21 +61,15 @@ entry:
; CHECK-NEXT: .long 45
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32) #2
+declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll
index 7b63699..12c3936 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%union.v1 = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !19 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !19 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !30, metadata !DIExpression()), !dbg !31
%0 = tail call ptr @llvm.preserve.union.access.index.p0.p0(ptr %arg, i32 1), !dbg !32, !llvm.preserve.access.index !24
@@ -31,7 +31,7 @@ entry:
%2 = bitcast ptr %1 to ptr, !dbg !32
%3 = tail call ptr @llvm.preserve.union.access.index.p0.p0(ptr %2, i32 1), !dbg !32, !llvm.preserve.access.index !6
%b = getelementptr inbounds %union.v1, ptr %3, i64 0, i32 0, !dbg !32
- %call = tail call i32 @get_value(ptr %b) #4, !dbg !33
+ %call = tail call i32 @get_value(ptr %b), !dbg !33
ret i32 %call, !dbg !34
}
@@ -62,24 +62,18 @@ entry:
; CHECK-NEXT: .long 118
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32) #2
+declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!15, !16, !17}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll
index 499e368..ee1f0e2 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll
@@ -14,7 +14,7 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind readonly
-define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !20, metadata !DIExpression()), !dbg !21
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 1), !dbg !22, !llvm.preserve.access.index !15
@@ -42,14 +42,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll
index 2aadbdf..3d66435 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll
@@ -14,7 +14,7 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind readnone
-define dso_local ptr @test(ptr readnone %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local ptr @test(ptr readnone %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !19, metadata !DIExpression()), !dbg !20
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 1), !dbg !21, !llvm.preserve.access.index !13
@@ -42,14 +42,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll
index 34ea050..cf75909 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll
@@ -40,11 +40,11 @@ target triple = "bpfel"
%struct.s = type { i32, i16 }
; Function Attrs: nounwind
-define dso_local i32 @field_read(ptr %arg) local_unnamed_addr #0 !dbg !20 {
+define dso_local i32 @field_read(ptr %arg) local_unnamed_addr !dbg !20 {
entry:
%ull = alloca i64, align 8
call void @llvm.dbg.value(metadata ptr %arg, metadata !31, metadata !DIExpression()), !dbg !37
- call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %ull) #5, !dbg !38
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %ull), !dbg !38
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 2), !dbg !39, !llvm.preserve.access.index !25
%1 = tail call i32 @llvm.bpf.preserve.field.info.p0(ptr %0, i64 0), !dbg !40
call void @llvm.dbg.value(metadata i32 %1, metadata !34, metadata !DIExpression()), !dbg !37
@@ -52,7 +52,7 @@ entry:
call void @llvm.dbg.value(metadata i32 %2, metadata !35, metadata !DIExpression()), !dbg !37
%idx.ext = zext i32 %1 to i64, !dbg !43
%add.ptr = getelementptr i8, ptr %arg, i64 %idx.ext, !dbg !43
- call void @bpf_probe_read(ptr nonnull %ull, i32 %2, ptr %add.ptr) #5, !dbg !44
+ call void @bpf_probe_read(ptr nonnull %ull, i32 %2, ptr %add.ptr), !dbg !44
%3 = call i32 @llvm.bpf.preserve.field.info.p0(ptr %0, i64 4), !dbg !45
call void @llvm.dbg.value(metadata i32 %3, metadata !36, metadata !DIExpression()), !dbg !37
%4 = load i64, ptr %ull, align 8, !dbg !46, !tbaa !47
@@ -68,7 +68,7 @@ entry:
%shr3 = lshr i64 %shl, %sh_prom1, !dbg !53
%retval.0.in = select i1 %tobool, i64 %shr3, i64 %shr, !dbg !53
%retval.0 = trunc i64 %retval.0.in to i32, !dbg !37
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %ull) #5, !dbg !54
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %ull), !dbg !54
ret i32 %retval.0, !dbg !54
}
@@ -114,28 +114,21 @@ entry:
; CHECK-NEXT: .long 3
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #2
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
-declare dso_local void @bpf_probe_read(ptr, i32, ptr) local_unnamed_addr #3
+declare dso_local void @bpf_probe_read(ptr, i32, ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #4
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nounwind readnone }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind readnone speculatable willreturn }
-attributes #5 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!16, !17, !18}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll
index 01c5e69..d5b2d052 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll
@@ -42,7 +42,7 @@ target triple = "bpfeb"
%struct.s = type { i32, i16 }
; Function Attrs: nounwind readonly
-define dso_local i32 @field_read(ptr %arg) local_unnamed_addr #0 !dbg !26 {
+define dso_local i32 @field_read(ptr %arg) local_unnamed_addr !dbg !26 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !37, metadata !DIExpression()), !dbg !41
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 2), !dbg !42, !llvm.preserve.access.index !31
@@ -157,17 +157,13 @@ sw.epilog: ; preds = %entry, %sw.bb9, %sw
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!22, !23, !24}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll
index d458d41..5076e79 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll
@@ -42,7 +42,7 @@ target triple = "bpfel"
%struct.s = type { i32, i16 }
; Function Attrs: nounwind readonly
-define dso_local i32 @field_read(ptr %arg) local_unnamed_addr #0 !dbg !26 {
+define dso_local i32 @field_read(ptr %arg) local_unnamed_addr !dbg !26 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !37, metadata !DIExpression()), !dbg !41
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 2), !dbg !42, !llvm.preserve.access.index !31
@@ -157,17 +157,13 @@ sw.epilog: ; preds = %entry, %sw.bb9, %sw
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!22, !23, !24}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll
index 7657b78..2f42118 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll
@@ -19,10 +19,10 @@ target triple = "bpf"
@g = dso_local global %struct.v3 zeroinitializer, section "stats", align 4, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !16 {
+define dso_local i32 @test() local_unnamed_addr !dbg !16 {
entry:
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) nonnull @g, i32 1, i32 1), !dbg !19, !llvm.preserve.access.index !7
- %call = tail call i32 @get_value(ptr %0) #3, !dbg !20
+ %call = tail call i32 @get_value(ptr %0), !dbg !20
ret i32 %call, !dbg !21
}
@@ -45,15 +45,10 @@ entry:
; CHECK-NEXT: .long 23
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind }
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!12, !13, !14}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll
index bed14ab..f43df76 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll
@@ -19,12 +19,12 @@ target triple = "bpf"
@g = dso_local global [4 x [5 x %struct.v3]] zeroinitializer, section "stats", align 4, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !23 {
+define dso_local i32 @test() local_unnamed_addr !dbg !23 {
entry:
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype([4 x [5 x %struct.v3]]) nonnull @g, i32 1, i32 1), !dbg !26, !llvm.preserve.access.index !6
%1 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype([5 x %struct.v3]) %0, i32 1, i32 2), !dbg !26, !llvm.preserve.access.index !16
%2 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) %1, i32 1, i32 1), !dbg !26, !llvm.preserve.access.index !8
- %call = tail call i32 @get_value(ptr %2) #3, !dbg !27
+ %call = tail call i32 @get_value(ptr %2), !dbg !27
ret i32 %call, !dbg !28
}
@@ -47,21 +47,15 @@ entry:
; CHECK-NEXT: .long 23
; CHECK-NEXT: .long 0
-
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind }
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!19, !20, !21}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll
index 49b89e2..5bc2bf9 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll
@@ -19,11 +19,11 @@ target triple = "bpf"
@g = dso_local local_unnamed_addr global ptr null, section "stats", align 8, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
%0 = load ptr, ptr @g, align 8, !dbg !20, !tbaa !21
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) %0, i32 1, i32 1), !dbg !20, !llvm.preserve.access.index !8
- %call = tail call i32 @get_value(ptr %1) #3, !dbg !25
+ %call = tail call i32 @get_value(ptr %1), !dbg !25
ret i32 %call, !dbg !26
}
@@ -45,15 +45,10 @@ entry:
; CHECK-NEXT: .long 23
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind }
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll
index 4ff170cf..983383c 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll
@@ -13,11 +13,11 @@
target triple = "bpf"
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !10 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !10 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !14, metadata !DIExpression()), !dbg !15
%0 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype(i32) %arg, i32 0, i32 4), !dbg !16, !llvm.preserve.access.index !4
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !17
+ %call = tail call i32 @get_value(ptr %0), !dbg !17
ret i32 %call, !dbg !18
}
@@ -26,19 +26,13 @@ entry:
; CHECK: .section .BTF.ext,"",@progbits
; CHECK-NOT: .long 16 # FieldReloc
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!6, !7, !8}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll
index e5f86c2..c67d57f 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll
@@ -29,7 +29,7 @@ target triple = "bpf"
%struct.t1 = type { i32 }
; Function Attrs: nounwind
-define dso_local void @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local void @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !22, metadata !DIExpression()), !dbg !29
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.s1s.p0.r1s(ptr elementtype(%struct.r1) %arg, i32 0, i32 0), !dbg !30, !llvm.preserve.access.index !11
@@ -38,7 +38,7 @@ entry:
call void @llvm.dbg.value(metadata ptr %1, metadata !25, metadata !DIExpression()), !dbg !29
%2 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.t1s(ptr elementtype(%struct.t1) %1, i32 0, i32 0), !dbg !32, !llvm.preserve.access.index !17
call void @llvm.dbg.value(metadata ptr %2, metadata !27, metadata !DIExpression()), !dbg !29
- tail call void @test1(ptr %0, ptr %1, ptr %2) #4, !dbg !36
+ tail call void @test1(ptr %0, ptr %1, ptr %2), !dbg !36
ret void, !dbg !37
}
@@ -67,24 +67,18 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.s1s.p0.r1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.s1s.p0.r1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.t1s.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.t1s.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.t1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.t1s(ptr, i32, i32)
-declare dso_local void @test1(ptr, ptr, ptr) local_unnamed_addr #2
+declare dso_local void @test1(ptr, ptr, ptr) local_unnamed_addr
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll
index 8ca3ef5..7ffb4de 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll
@@ -17,14 +17,14 @@ target triple = "bpf"
%struct.v3 = type { i32, [4 x [4 x i32]] }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !21 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !21 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !25, metadata !DIExpression()), !dbg !26
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 0, i32 1), !dbg !27, !llvm.preserve.access.index !4
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) %0, i32 1, i32 1), !dbg !27, !llvm.preserve.access.index !6
%2 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x [4 x i32]]) %1, i32 1, i32 2), !dbg !27, !llvm.preserve.access.index !11
%3 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x i32]) %2, i32 1, i32 3), !dbg !27, !llvm.preserve.access.index !15
- %call = tail call i32 @get_value(ptr %3) #4, !dbg !28
+ %call = tail call i32 @get_value(ptr %3), !dbg !28
ret i32 %call, !dbg !29
}
@@ -46,27 +46,21 @@ entry:
; CHECK-NEXT: .long 58
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!17, !18, !19}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll
index b2ba5a8..55bb7c58 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll
@@ -17,7 +17,7 @@ target triple = "bpf"
%struct.v3 = type { i32, [4 x [4 x [4 x i32]]] }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !23 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !23 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !28
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 0, i32 1), !dbg !29, !llvm.preserve.access.index !4
@@ -25,7 +25,7 @@ entry:
%2 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x [4 x [4 x i32]]]) %1, i32 1, i32 2), !dbg !29, !llvm.preserve.access.index !11
%3 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x [4 x i32]]) %2, i32 1, i32 3), !dbg !29, !llvm.preserve.access.index !15
%4 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x i32]) %3, i32 1, i32 2), !dbg !29, !llvm.preserve.access.index !17
- %call = tail call i32 @get_value(ptr %4) #4, !dbg !30
+ %call = tail call i32 @get_value(ptr %4), !dbg !30
ret i32 %call, !dbg !31
}
@@ -47,29 +47,23 @@ entry:
; CHECK-NEXT: .long 58
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!19, !20, !21}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll
index e00bbb8..a5b4604 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll
@@ -28,16 +28,16 @@ target triple = "bpf"
%struct.net_device = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca i32, align 4
call void @llvm.dbg.value(metadata ptr %0, metadata !28, metadata !DIExpression()), !dbg !30
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2) #4, !dbg !31
+ call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2), !dbg !31
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr elementtype(%struct.sk_buff) %0, i32 1, i32 1), !dbg !32, !llvm.preserve.access.index !19
%4 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr elementtype(%struct.net_device) %3, i32 0, i32 0), !dbg !32, !llvm.preserve.access.index !23
- %5 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %4) #4, !dbg !33
+ %5 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %4), !dbg !33
%6 = load i32, ptr %2, align 4, !dbg !34, !tbaa !35
call void @llvm.dbg.value(metadata i32 %6, metadata !29, metadata !DIExpression()), !dbg !30
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2) #4, !dbg !39
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2), !dbg !39
ret i32 %6, !dbg !40
}
@@ -130,25 +130,19 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr, i32 immarg, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll
index b4d1844..ffd77ed 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll
@@ -16,11 +16,11 @@ target triple = "bpf"
%struct.v3 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !15 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !19, metadata !DIExpression()), !dbg !20
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 0, i32 1), !dbg !21, !llvm.preserve.access.index !4
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !22
+ %call = tail call i32 @get_value(ptr %0), !dbg !22
ret i32 %call, !dbg !23
}
@@ -42,19 +42,13 @@ entry:
; CHECK-NEXT: .long 32
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll
index 87b88bc..cb0aff3 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll
@@ -16,12 +16,12 @@ target triple = "bpf"
%struct.v3 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !15 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !19, metadata !DIExpression()), !dbg !20
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 0, i32 1), !dbg !21, !llvm.preserve.access.index !4
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) %0, i32 1, i32 1), !dbg !21, !llvm.preserve.access.index !6
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !22
+ %call = tail call i32 @get_value(ptr %1), !dbg !22
ret i32 %call, !dbg !23
}
@@ -42,22 +42,16 @@ entry:
; CHECK-NEXT: .long 32
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll
index 8ebbfea..2081b3f 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll
@@ -27,17 +27,17 @@ target triple = "bpf"
%struct.anon = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca i32, align 4
call void @llvm.dbg.value(metadata ptr %0, metadata !31, metadata !DIExpression()), !dbg !33
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2) #4, !dbg !34
+ call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2), !dbg !34
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.anons.p0.sk_buffs(ptr elementtype(%struct.sk_buff) %0, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !19
%4 = tail call ptr @llvm.preserve.array.access.index.p0.anons.p0.anons(ptr elementtype([10 x %struct.anon]) %3, i32 1, i32 5), !dbg !35, !llvm.preserve.access.index !23
%5 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.anons(ptr elementtype(%struct.anon) %4, i32 0, i32 0), !dbg !35, !llvm.preserve.access.index !24
- %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5) #4, !dbg !36
+ %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5), !dbg !36
%7 = load i32, ptr %2, align 4, !dbg !37, !tbaa !38
call void @llvm.dbg.value(metadata i32 %7, metadata !32, metadata !DIExpression()), !dbg !33
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2) #4, !dbg !42
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2), !dbg !42
ret i32 %7, !dbg !43
}
@@ -140,28 +140,22 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.anons.p0.sk_buffs(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.anons.p0.sk_buffs(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.anons.p0.anons(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.anons.p0.anons(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.anons(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.anons(ptr, i32 immarg, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll
index 64ec250..4e51366 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll
@@ -28,17 +28,17 @@ target triple = "bpf"
%struct.net_device = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca i32, align 4
call void @llvm.dbg.value(metadata ptr %0, metadata !31, metadata !DIExpression()), !dbg !33
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2) #4, !dbg !34
+ call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2), !dbg !34
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr elementtype(%struct.sk_buff) %0, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !19
%4 = tail call ptr @llvm.preserve.array.access.index.p0.net_devices.p0.net_devices(ptr elementtype([10 x %struct.net_device]) %3, i32 1, i32 5), !dbg !35, !llvm.preserve.access.index !23
%5 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr elementtype(%struct.net_device) %4, i32 0, i32 0), !dbg !35, !llvm.preserve.access.index !24
- %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5) #4, !dbg !36
+ %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5), !dbg !36
%7 = load i32, ptr %2, align 4, !dbg !37, !tbaa !38
call void @llvm.dbg.value(metadata i32 %7, metadata !32, metadata !DIExpression()), !dbg !33
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2) #4, !dbg !42
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2), !dbg !42
ret i32 %7, !dbg !43
}
@@ -143,28 +143,22 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.net_devices.p0.net_devices(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.net_devices.p0.net_devices(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr, i32 immarg, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll
index ed462e1..eb0620d 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll
@@ -20,12 +20,12 @@ target triple = "bpf"
%struct.__s = type { [7 x i32] }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !24, metadata !DIExpression()), !dbg !25
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr elementtype(%struct.__s) %arg, i32 0, i32 0), !dbg !26, !llvm.preserve.access.index !13
%1 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([7 x i32]) %0, i32 1, i32 1), !dbg !26, !llvm.preserve.access.index !19
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !27
+ %call = tail call i32 @get_value(ptr %1), !dbg !27
ret i32 %call, !dbg !28
}
@@ -48,22 +48,16 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll
index 6b806ae..c4edda1 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll
@@ -18,7 +18,7 @@ target triple = "bpf"
%struct.__t = type { i32 }
; Function Attrs: nounwind readonly
-define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !18, metadata !DIExpression()), !dbg !19
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.__ts(ptr elementtype(%struct.__t) %arg, i32 0, i32 0), !dbg !20, !llvm.preserve.access.index !4
@@ -50,14 +50,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.__ts(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.__ts(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable}
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll
index c2b5a11..f8cf253 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll
@@ -20,11 +20,11 @@ target triple = "bpf"
%struct.__s = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !21, metadata !DIExpression()), !dbg !22
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr elementtype(%struct.__s) %arg, i32 1, i32 1), !dbg !23, !llvm.preserve.access.index !14
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !24
+ %call = tail call i32 @get_value(ptr %0), !dbg !24
ret i32 %call, !dbg !25
}
@@ -47,19 +47,13 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll
index a63b7e7..0fe7c1f 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll
@@ -18,7 +18,7 @@ target triple = "bpf"
%union.__t = type { i32 }
; Function Attrs: nounwind readonly
-define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !18, metadata !DIExpression()), !dbg !19
%0 = tail call ptr @llvm.preserve.union.access.index.p0.__ts.p0.__ts(ptr %arg, i32 0), !dbg !20, !llvm.preserve.access.index !4
@@ -50,14 +50,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.__ts.p0.__ts(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.__ts.p0.__ts(ptr, i32)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable}
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll
index 4b3d178..aa8705d 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll
@@ -20,11 +20,11 @@ target triple = "bpf"
%union.__s = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !21, metadata !DIExpression()), !dbg !22
%0 = tail call ptr @llvm.preserve.union.access.index.p0.__ss.p0.__ss(ptr %arg, i32 1), !dbg !23, !llvm.preserve.access.index !14
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !24
+ %call = tail call i32 @get_value(ptr %0), !dbg !24
ret i32 %call, !dbg !25
}
@@ -47,19 +47,13 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.__ss.p0.__ss(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.__ss.p0.__ss(ptr, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll
index e757327..5195d17 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll
@@ -24,13 +24,13 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !29
%0 = tail call ptr @llvm.preserve.array.access.index.p0.us.p0.us(ptr elementtype([7 x %union.u]) %arg, i32 0, i32 1), !dbg !30, !llvm.preserve.access.index !14
%1 = tail call ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr %0, i32 1), !dbg !30, !llvm.preserve.access.index !16
%2 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %1, i32 1, i32 1), !dbg !30, !llvm.preserve.access.index !20
- %call = tail call i32 @get_value(ptr %2) #4, !dbg !31
+ %call = tail call i32 @get_value(ptr %2), !dbg !31
ret i32 %call, !dbg !32
}
@@ -53,25 +53,19 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR:[0-9]+]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.us.p0.us(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.us.p0.us(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll
index 824eba9a..e156999 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll
@@ -31,17 +31,17 @@ target triple = "bpf"
%union.anon = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca i32, align 4
call void @llvm.dbg.value(metadata ptr %0, metadata !32, metadata !DIExpression()), !dbg !34
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2) #4, !dbg !35
+ call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2), !dbg !35
%3 = tail call ptr @llvm.preserve.union.access.index.p0.sk_buffs.p0.sk_buffs(ptr %0, i32 1), !dbg !36, !llvm.preserve.access.index !19
%4 = tail call ptr @llvm.preserve.struct.access.index.p0.anons.p0.anons(ptr elementtype(%struct.anon) %3, i32 1, i32 1), !dbg !36, !llvm.preserve.access.index !23
%5 = tail call ptr @llvm.preserve.union.access.index.p0.anons.p0.anons(ptr %4, i32 0), !dbg !36, !llvm.preserve.access.index !27
- %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5) #4, !dbg !37
+ %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5), !dbg !37
%7 = load i32, ptr %2, align 4, !dbg !38, !tbaa !39
call void @llvm.dbg.value(metadata i32 %7, metadata !33, metadata !DIExpression()), !dbg !34
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2) #4, !dbg !43
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2), !dbg !43
ret i32 %7, !dbg !44
}
@@ -145,28 +145,22 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.sk_buffs.p0.sk_buffs(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.sk_buffs.p0.sk_buffs(ptr, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.anons.p0.anons(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.anons.p0.anons(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.anons.p0.anons(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.anons.p0.anons(ptr, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/store-addr.ll b/llvm/test/CodeGen/BPF/CORE/store-addr.ll
index 33bbd01..2c8a0c4 100644
--- a/llvm/test/CodeGen/BPF/CORE/store-addr.ll
+++ b/llvm/test/CodeGen/BPF/CORE/store-addr.ll
@@ -22,17 +22,17 @@ target triple = "bpf"
%struct.t = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !14 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !14 {
entry:
%param = alloca [1 x i64], align 8
call void @llvm.dbg.value(metadata ptr %arg, metadata !22, metadata !DIExpression()), !dbg !27
- call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %param) #5, !dbg !28
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %param), !dbg !28
call void @llvm.dbg.declare(metadata ptr %param, metadata !23, metadata !DIExpression()), !dbg !29
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr elementtype(%struct.t) %arg, i32 0, i32 0), !dbg !30, !llvm.preserve.access.index !18
%1 = ptrtoint ptr %0 to i64, !dbg !31
store i64 %1, ptr %param, align 8, !dbg !33, !tbaa !34
- %call = call i32 @foo(ptr nonnull %param) #5, !dbg !38
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %param) #5, !dbg !39
+ %call = call i32 @foo(ptr nonnull %param), !dbg !38
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %param), !dbg !39
ret i32 %call, !dbg !40
}
@@ -41,28 +41,21 @@ entry:
; CHECK: *(u64 *)(r10 - 8) = r1
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #2
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr, i32, i32) #3
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr, i32, i32)
-declare !dbg !5 dso_local i32 @foo(ptr) local_unnamed_addr #4
+declare !dbg !5 dso_local i32 @foo(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #2
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
-attributes #2 = { argmemonly nounwind }
-attributes #3 = { nounwind readnone }
-attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!10, !11, !12}
diff --git a/llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll b/llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll
index 8a4b37d..09ca422 100644
--- a/llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll
+++ b/llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll
@@ -20,12 +20,12 @@
; clang -target bpf -O2 -S -emit-llvm -Xclang -disable-llvm-passes test.c
; Function Attrs: nounwind
-define dso_local i32 @test() #0 {
+define dso_local i32 @test() {
entry:
%retval = alloca i32, align 4
%ret = alloca i32, align 4
%cleanup.dest.slot = alloca i32, align 4
- call void @llvm.lifetime.start.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.start.p0(i64 4, ptr %ret)
%call = call i32 @foo()
store i32 %call, ptr %ret, align 4, !tbaa !2
%0 = load i32, ptr %ret, align 4, !tbaa !2
@@ -62,25 +62,20 @@ if.end: ; preds = %lor.lhs.false
br label %cleanup
cleanup: ; preds = %if.end, %if.then
- call void @llvm.lifetime.end.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.end.p0(i64 4, ptr %ret)
%3 = load i32, ptr %retval, align 4
ret i32 %3
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local i32 @foo(...) #2
+declare dso_local i32 @foo(...)
-declare dso_local i32 @bar(i32) #2
+declare dso_local i32 @bar(i32)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll b/llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll
index ad157fe..bbda062 100644
--- a/llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll
+++ b/llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll
@@ -18,12 +18,12 @@
; clang -target bpf -O2 -S -emit-llvm -Xclang -disable-llvm-passes test.c
; Function Attrs: nounwind
-define dso_local i32 @test() #0 {
+define dso_local i32 @test() {
entry:
%retval = alloca i32, align 4
%ret = alloca i32, align 4
%cleanup.dest.slot = alloca i32, align 4
- call void @llvm.lifetime.start.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.start.p0(i64 4, ptr %ret)
%call = call i32 @foo()
store i32 %call, ptr %ret, align 4, !tbaa !2
%0 = load i32, ptr %ret, align 4, !tbaa !2
@@ -65,25 +65,20 @@ if.end3: ; preds = %if.end
br label %cleanup
cleanup: ; preds = %if.end3, %if.then2, %if.then
- call void @llvm.lifetime.end.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.end.p0(i64 4, ptr %ret)
%3 = load i32, ptr %retval, align 4
ret i32 %3
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local i32 @foo(...) #2
+declare dso_local i32 @foo(...)
-declare dso_local i32 @bar(i32) #2
+declare dso_local i32 @bar(i32)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll b/llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll
index d118fa0..d34d652 100644
--- a/llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll
+++ b/llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll
@@ -15,12 +15,12 @@
; clang -target bpf -O2 -S -emit-llvm -Xclang -disable-llvm-passes test.c
; Function Attrs: nounwind
-define dso_local ptr @test(ptr %p) #0 {
+define dso_local ptr @test(ptr %p) {
entry:
%p.addr = alloca ptr, align 8
%ret = alloca i64, align 8
store ptr %p, ptr %p.addr, align 8, !tbaa !2
- call void @llvm.lifetime.start.p0(i64 8, ptr %ret) #3
+ call void @llvm.lifetime.start.p0(i64 8, ptr %ret)
%call = call i64 @foo()
store i64 %call, ptr %ret, align 8, !tbaa !6
%0 = load i64, ptr %ret, align 8, !tbaa !6
@@ -36,7 +36,7 @@ if.then: ; preds = %entry
if.end: ; preds = %if.then, %entry
%3 = load ptr, ptr %p.addr, align 8, !tbaa !2
- call void @llvm.lifetime.end.p0(i64 8, ptr %ret) #3
+ call void @llvm.lifetime.end.p0(i64 8, ptr %ret)
ret ptr %3
}
; CHECK-COMMON: [[REG6:r[0-9]+]] = r1
@@ -57,17 +57,12 @@ if.end: ; preds = %if.then, %entry
; CHECK-COMMON: exit
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local i64 @foo(...) #2
+declare dso_local i64 @foo(...)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll b/llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll
index 218fa5d..5f3fa94 100644
--- a/llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll
+++ b/llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll
@@ -15,12 +15,12 @@
; clang -target bpf -O2 -S -emit-llvm -Xclang -disable-llvm-passes test.c
; Function Attrs: nounwind
-define dso_local ptr @test(ptr %p) #0 {
+define dso_local ptr @test(ptr %p) {
entry:
%p.addr = alloca ptr, align 8
%ret = alloca i32, align 4
store ptr %p, ptr %p.addr, align 8, !tbaa !2
- call void @llvm.lifetime.start.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.start.p0(i64 4, ptr %ret)
%call = call i32 @foo()
store i32 %call, ptr %ret, align 4, !tbaa !6
%0 = load i32, ptr %ret, align 4, !tbaa !6
@@ -37,7 +37,7 @@ if.then: ; preds = %entry
if.end: ; preds = %if.then, %entry
%3 = load ptr, ptr %p.addr, align 8, !tbaa !2
- call void @llvm.lifetime.end.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.end.p0(i64 4, ptr %ret)
ret ptr %3
}
@@ -66,17 +66,12 @@ if.end: ; preds = %if.then, %entry
; CHECK-COMMON: exit
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local i32 @foo(...) #2
+declare dso_local i32 @foo(...)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/callx.ll b/llvm/test/CodeGen/BPF/callx.ll
index d83e0f6..e027c1f 100644
--- a/llvm/test/CodeGen/BPF/callx.ll
+++ b/llvm/test/CodeGen/BPF/callx.ll
@@ -3,16 +3,13 @@
; int test(int (*f)(void)) { return f(); }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr nocapture %f) local_unnamed_addr #0 {
+define dso_local i32 @test(ptr nocapture %f) local_unnamed_addr {
entry:
- %call = tail call i32 %f() #1
+ %call = tail call i32 %f()
; CHECK: callx r{{[0-9]+}}
ret i32 %call
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/dwarfdump.ll b/llvm/test/CodeGen/BPF/dwarfdump.ll
index a3a6b52..d50c025 100644
--- a/llvm/test/CodeGen/BPF/dwarfdump.ll
+++ b/llvm/test/CodeGen/BPF/dwarfdump.ll
@@ -10,7 +10,7 @@ target triple = "bpf"
@testprog.myvar_c = internal unnamed_addr global i32 0, align 4, !dbg !0
; Function Attrs: nounwind
-define i32 @testprog(i32, i32) local_unnamed_addr #0 !dbg !2 {
+define i32 @testprog(i32, i32) local_unnamed_addr !dbg !2 {
tail call void @llvm.dbg.value(metadata i32 %0, i64 0, metadata !11, metadata !16), !dbg !17
tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !12, metadata !16), !dbg !18
%3 = load i32, ptr @testprog.myvar_c, align 4, !dbg !19, !tbaa !20
@@ -21,10 +21,7 @@ define i32 @testprog(i32, i32) local_unnamed_addr #0 !dbg !2 {
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!13, !14}
diff --git a/llvm/test/CodeGen/BPF/i128.ll b/llvm/test/CodeGen/BPF/i128.ll
index a966e3e..3c94e0c 100644
--- a/llvm/test/CodeGen/BPF/i128.ll
+++ b/llvm/test/CodeGen/BPF/i128.ll
@@ -19,14 +19,14 @@
%struct.ipv6_key_t = type { i32, i128, i16 }
; Function Attrs: nounwind
-define dso_local i32 @test(i32 %pid) local_unnamed_addr #0 {
+define dso_local i32 @test(i32 %pid) local_unnamed_addr {
entry:
%ipv6_key = alloca %struct.ipv6_key_t, align 16
- call void @llvm.lifetime.start.p0(i64 48, ptr nonnull %ipv6_key) #4
+ call void @llvm.lifetime.start.p0(i64 48, ptr nonnull %ipv6_key)
call void @llvm.memset.p0.i64(ptr nonnull align 16 dereferenceable(48) %ipv6_key, i8 0, i64 48, i1 false)
store i32 %pid, ptr %ipv6_key, align 16, !tbaa !2
- call void @test1(ptr nonnull %ipv6_key) #4
- call void @llvm.lifetime.end.p0(i64 48, ptr nonnull %ipv6_key) #4
+ call void @test1(ptr nonnull %ipv6_key)
+ call void @llvm.lifetime.end.p0(i64 48, ptr nonnull %ipv6_key)
ret i32 0
}
@@ -35,21 +35,15 @@ entry:
; CHECK: *(u32 *)(r10 - 48) = r{{[0-9]+}}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: argmemonly nounwind willreturn writeonly
-declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
-declare dso_local void @test1(ptr) local_unnamed_addr #3
+declare dso_local void @test1(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { argmemonly nounwind willreturn writeonly }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/is_trunc_free.ll b/llvm/test/CodeGen/BPF/is_trunc_free.ll
index fe00731..6bb8568 100644
--- a/llvm/test/CodeGen/BPF/is_trunc_free.ll
+++ b/llvm/test/CodeGen/BPF/is_trunc_free.ll
@@ -29,7 +29,7 @@
%struct.env_t = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %skb) local_unnamed_addr #0 {
+define dso_local i32 @test(ptr %skb) local_unnamed_addr {
entry:
%data_end1 = getelementptr inbounds %struct.env_t, ptr %skb, i64 0, i32 1
%0 = load i32, ptr %data_end1, align 4, !tbaa !2
@@ -49,7 +49,7 @@ if.end10: ; preds = %entry
%sub.ptr.lhs.cast = ptrtoint ptr %add.ptr to i64
%4 = trunc i64 %sub.ptr.lhs.cast to i32
%conv13 = sub i32 %4, %2
- %call = tail call i32 @work(ptr nonnull %skb, i32 %conv13) #2
+ %call = tail call i32 @work(ptr nonnull %skb, i32 %conv13)
br label %cleanup
cleanup: ; preds = %entry, %if.end10
@@ -59,11 +59,7 @@ cleanup: ; preds = %entry, %if.end10
; CHECK: w{{[0-9]+}} = *(u32 *)(r{{[0-9]+}} + 0)
-declare dso_local i32 @work(ptr, i32) local_unnamed_addr #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare dso_local i32 @work(ptr, i32) local_unnamed_addr
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/is_zext_free.ll b/llvm/test/CodeGen/BPF/is_zext_free.ll
index 4b81a90..3b794a9 100644
--- a/llvm/test/CodeGen/BPF/is_zext_free.ll
+++ b/llvm/test/CodeGen/BPF/is_zext_free.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -emit-llvm -S test.c
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @test(i64 %x, i64 %y) local_unnamed_addr #0 {
+define dso_local i32 @test(i64 %x, i64 %y) local_unnamed_addr {
entry:
%and = and i64 %y, %x
%conv = trunc i64 %and to i32
@@ -17,8 +17,6 @@ entry:
; CHECK: r[[REG1:[0-9]+]] = r{{[0-9]+}}
; CHECK: w[[REG1]] &= w{{[0-9]+}}
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/objdump_two_funcs.ll b/llvm/test/CodeGen/BPF/objdump_two_funcs.ll
index fb1043c..8158a1b 100644
--- a/llvm/test/CodeGen/BPF/objdump_two_funcs.ll
+++ b/llvm/test/CodeGen/BPF/objdump_two_funcs.ll
@@ -14,7 +14,7 @@
; clang -target bpf -S -gdwarf-5 -gembed-source -emit-llvm -g -O2 bug.c
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @func1(i32 %a) local_unnamed_addr #0 section "s1" !dbg !7 {
+define dso_local i32 @func1(i32 %a) local_unnamed_addr section "s1" !dbg !7 {
entry:
; CHECK: <func1>:
call void @llvm.dbg.value(metadata i32 %a, metadata !12, metadata !DIExpression()), !dbg !13
@@ -24,7 +24,7 @@ entry:
}
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @func2(i32 %a) local_unnamed_addr #0 section "s2" !dbg !16 {
+define dso_local i32 @func2(i32 %a) local_unnamed_addr section "s2" !dbg !16 {
entry:
; CHECK: <func2>:
call void @llvm.dbg.value(metadata i32 %a, metadata !18, metadata !DIExpression()), !dbg !19
@@ -35,10 +35,7 @@ entry:
}
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/optnone-1.ll b/llvm/test/CodeGen/BPF/optnone-1.ll
index 68046bf..f45c85b 100644
--- a/llvm/test/CodeGen/BPF/optnone-1.ll
+++ b/llvm/test/CodeGen/BPF/optnone-1.ll
@@ -5,7 +5,7 @@
; clang -target bpf -g -S -emit-llvm test.c
; Function Attrs: noinline nounwind optnone
-define dso_local i32 @test(i32 %a, i32 %b) #0 !dbg !7 {
+define dso_local i32 @test(i32 %a, i32 %b) !dbg !7 {
entry:
%a.addr = alloca i32, align 4
%b.addr = alloca i32, align 4
@@ -22,10 +22,7 @@ entry:
; CHECK-LABEL: test
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable}
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/reloc-btf-2.ll b/llvm/test/CodeGen/BPF/reloc-btf-2.ll
index 7398257..430abc7 100644
--- a/llvm/test/CodeGen/BPF/reloc-btf-2.ll
+++ b/llvm/test/CodeGen/BPF/reloc-btf-2.ll
@@ -14,7 +14,7 @@
@s = internal global i32 0, align 4, !dbg !6
; Function Attrs: norecurse nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !14 {
+define dso_local i32 @test() local_unnamed_addr !dbg !14 {
%1 = load i32, ptr @g, align 4, !dbg !17, !tbaa !18
%2 = load volatile i32, ptr @s, align 4, !dbg !22, !tbaa !18
%3 = add nsw i32 %2, %1, !dbg !23
@@ -27,8 +27,6 @@ define dso_local i32 @test() local_unnamed_addr #0 !dbg !14 {
; CHECK-RELOC: R_BPF_64_NODYLD32 g
; CHECK-RELOC: RELOCATION RECORDS FOR [.BTF.ext]:
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!10, !11, !12}
!llvm.ident = !{!13}
diff --git a/llvm/test/CodeGen/BPF/reloc-btf.ll b/llvm/test/CodeGen/BPF/reloc-btf.ll
index b9f6e3a..875bfa1 100644
--- a/llvm/test/CodeGen/BPF/reloc-btf.ll
+++ b/llvm/test/CodeGen/BPF/reloc-btf.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=bpfel -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test() local_unnamed_addr !dbg !7 {
entry:
ret i32 0, !dbg !11
}
@@ -13,8 +13,6 @@ entry:
; CHECK-RELOC: RELOCATION RECORDS FOR [.BTF.ext]:
; CHECK-RELOC: R_BPF_64_NODYLD32 .text
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/simplifycfg.ll b/llvm/test/CodeGen/BPF/simplifycfg.ll
index fcd2321..d53b51a 100644
--- a/llvm/test/CodeGen/BPF/simplifycfg.ll
+++ b/llvm/test/CodeGen/BPF/simplifycfg.ll
@@ -38,15 +38,15 @@ target triple = "bpf"
%struct.FrameData = type { ptr }
; Function Attrs: nounwind
-define dso_local i32 @test() #0 {
+define dso_local i32 @test() {
entry:
%frame_ptr = alloca ptr, align 8
%frame = alloca %struct.FrameData, align 8
%i = alloca i32, align 4
- call void @llvm.lifetime.start.p0(i64 8, ptr %frame_ptr) #3
- call void @llvm.lifetime.start.p0(i64 8, ptr %frame) #3
+ call void @llvm.lifetime.start.p0(i64 8, ptr %frame_ptr)
+ call void @llvm.lifetime.start.p0(i64 8, ptr %frame)
call void @get_frame_ptr(ptr %frame_ptr)
- call void @llvm.lifetime.start.p0(i64 4, ptr %i) #3
+ call void @llvm.lifetime.start.p0(i64 4, ptr %i)
store i32 0, ptr %i, align 4, !tbaa !2
br label %for.cond
@@ -61,7 +61,7 @@ for.cond: ; preds = %for.inc, %entry
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
- call void @llvm.lifetime.end.p0(i64 4, ptr %i) #3
+ call void @llvm.lifetime.end.p0(i64 4, ptr %i)
br label %for.end
for.body: ; preds = %for.cond
@@ -93,25 +93,20 @@ for.end: ; preds = %for.cond.cleanup
%5 = load ptr, ptr %frame_ptr, align 8, !tbaa !6
%cmp2 = icmp eq ptr %5, null
%conv = zext i1 %cmp2 to i32
- call void @llvm.lifetime.end.p0(i64 8, ptr %frame) #3
- call void @llvm.lifetime.end.p0(i64 8, ptr %frame_ptr) #3
+ call void @llvm.lifetime.end.p0(i64 8, ptr %frame)
+ call void @llvm.lifetime.end.p0(i64 8, ptr %frame_ptr)
ret i32 %conv
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local void @get_frame_ptr(ptr) #2
+declare dso_local void @get_frame_ptr(ptr)
-declare dso_local i32 @get_data(ptr, ptr) #2
+declare dso_local i32 @get_data(ptr, ptr)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/warn-stack.ll b/llvm/test/CodeGen/BPF/warn-stack.ll
index 58a6e4c..5e62a91 100644
--- a/llvm/test/CodeGen/BPF/warn-stack.ll
+++ b/llvm/test/CodeGen/BPF/warn-stack.ll
@@ -1,43 +1,37 @@
; RUN: not llc -mtriple=bpfel < %s 2>&1 >/dev/null | FileCheck %s
;; CHECK-NOT: nowarn
-define void @nowarn() local_unnamed_addr #0 !dbg !6 {
+define void @nowarn() local_unnamed_addr !dbg !6 {
%1 = alloca [504 x i8], align 1
- call void @llvm.lifetime.start.p0(i64 504, ptr nonnull %1) #4, !dbg !15
+ call void @llvm.lifetime.start.p0(i64 504, ptr nonnull %1), !dbg !15
tail call void @llvm.dbg.declare(metadata ptr %1, metadata !10, metadata !16), !dbg !17
- call void @doit(ptr nonnull %1) #4, !dbg !18
- call void @llvm.lifetime.end.p0(i64 504, ptr nonnull %1) #4, !dbg !19
+ call void @doit(ptr nonnull %1), !dbg !18
+ call void @llvm.lifetime.end.p0(i64 504, ptr nonnull %1), !dbg !19
ret void, !dbg !19
}
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
-declare void @doit(ptr) local_unnamed_addr #3
+declare void @doit(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
; CHECK: error: warn_stack.c
; CHECK: BPF stack limit
-define void @warn() local_unnamed_addr #0 !dbg !20 {
+define void @warn() local_unnamed_addr !dbg !20 {
%1 = alloca [512 x i8], align 1
- call void @llvm.lifetime.start.p0(i64 512, ptr nonnull %1) #4, !dbg !26
+ call void @llvm.lifetime.start.p0(i64 512, ptr nonnull %1), !dbg !26
tail call void @llvm.dbg.declare(metadata ptr %1, metadata !22, metadata !16), !dbg !27
- call void @doit(ptr nonnull %1) #4, !dbg !28
- call void @llvm.lifetime.end.p0(i64 512, ptr nonnull %1) #4, !dbg !29
+ call void @doit(ptr nonnull %1), !dbg !28
+ call void @llvm.lifetime.end.p0(i64 512, ptr nonnull %1), !dbg !29
ret void, !dbg !29
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
!llvm.ident = !{!5}
diff --git a/llvm/test/CodeGen/BPF/xadd.ll b/llvm/test/CodeGen/BPF/xadd.ll
index a3ec323..8d232ffb 100644
--- a/llvm/test/CodeGen/BPF/xadd.ll
+++ b/llvm/test/CodeGen/BPF/xadd.ll
@@ -17,7 +17,7 @@ target datalayout = "e-m:e-p:64:64-i64:64-n32:64-S128"
target triple = "bpf"
; Function Attrs: nounwind
-define dso_local i32 @test(ptr nocapture %ptr) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr nocapture %ptr) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %ptr, metadata !13, metadata !DIExpression()), !dbg !15
%0 = atomicrmw add ptr %ptr, i32 4 seq_cst, !dbg !16
@@ -28,10 +28,7 @@ entry:
}
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/DirectX/legalize-module-flags.ll b/llvm/test/CodeGen/DirectX/legalize-module-flags.ll
index 6c29dea..044bd91 100644
--- a/llvm/test/CodeGen/DirectX/legalize-module-flags.ll
+++ b/llvm/test/CodeGen/DirectX/legalize-module-flags.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-prepare -mtriple=dxil-unknown-shadermodel6.0-compute %s | FileCheck %s
+; RUN: opt -S -dxil-translate-metadata -mtriple=dxil-unknown-shadermodel6.0-compute %s | FileCheck %s
; Make sure behavior flag > 6 is fixed.
; CHECK: !{i32 2, !"frame-pointer", i32 2}
diff --git a/llvm/test/CodeGen/DirectX/legalize-module-flags2.ll b/llvm/test/CodeGen/DirectX/legalize-module-flags2.ll
index 244ec8d..b8a60a8 100644
--- a/llvm/test/CodeGen/DirectX/legalize-module-flags2.ll
+++ b/llvm/test/CodeGen/DirectX/legalize-module-flags2.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-prepare -mtriple=dxil-unknown-shadermodel6.0-library %s | FileCheck %s
+; RUN: opt -S -dxil-translate-metadata -mtriple=dxil-unknown-shadermodel6.0-library %s | FileCheck %s
; CHECK: define void @main()
; Make sure behavior flag > 6 is fixed.
diff --git a/llvm/test/CodeGen/DirectX/llc-pipeline.ll b/llvm/test/CodeGen/DirectX/llc-pipeline.ll
index 13c2539..d265826 100644
--- a/llvm/test/CodeGen/DirectX/llc-pipeline.ll
+++ b/llvm/test/CodeGen/DirectX/llc-pipeline.ll
@@ -40,8 +40,8 @@
; CHECK-NEXT: DXIL Resources Analysis
; CHECK-NEXT: DXIL Module Metadata analysis
; CHECK-NEXT: DXIL Shader Flag Analysis
-; CHECK-NEXT: DXIL Translate Metadata
; CHECK-NEXT: DXIL Root Signature Analysis
+; CHECK-NEXT: DXIL Translate Metadata
; CHECK-NEXT: DXIL Post Optimization Validation
; CHECK-NEXT: DXIL Op Lowering
; CHECK-NEXT: DXIL Prepare Module
diff --git a/llvm/test/CodeGen/DirectX/metadata-stripping.ll b/llvm/test/CodeGen/DirectX/metadata-stripping.ll
index eb939ba..531ab6c 100644
--- a/llvm/test/CodeGen/DirectX/metadata-stripping.ll
+++ b/llvm/test/CodeGen/DirectX/metadata-stripping.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S --dxil-prepare %s | FileCheck %s
+; RUN: opt -S --dxil-translate-metadata %s | FileCheck %s
; Test that only metadata nodes that are valid in DXIL are allowed through
diff --git a/llvm/test/CodeGen/DirectX/strip-llvm-errno-tbaa.ll b/llvm/test/CodeGen/DirectX/strip-llvm-errno-tbaa.ll
index 9190d03..2c4140d 100644
--- a/llvm/test/CodeGen/DirectX/strip-llvm-errno-tbaa.ll
+++ b/llvm/test/CodeGen/DirectX/strip-llvm-errno-tbaa.ll
@@ -1,6 +1,6 @@
-; RUN: opt -S -dxil-prepare < %s | FileCheck %s
+; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s
-; Ensures that dxil-prepare will remove the llvm.errno.tbaa metadata
+; Ensures that dxil-translate-metadata will remove the llvm.errno.tbaa metadata
target triple = "dxil-unknown-shadermodel6.0-compute"
@@ -10,7 +10,6 @@ entry:
}
; CHECK-NOT: !llvm.errno.tbaa
-; CHECK-NOT: {{^!}}
!llvm.errno.tbaa = !{!0}
diff --git a/llvm/test/CodeGen/DirectX/strip-rootsignatures.ll b/llvm/test/CodeGen/DirectX/strip-rootsignatures.ll
index 3ac617a..daf20bf 100644
--- a/llvm/test/CodeGen/DirectX/strip-rootsignatures.ll
+++ b/llvm/test/CodeGen/DirectX/strip-rootsignatures.ll
@@ -1,6 +1,6 @@
-; RUN: opt -S -dxil-prepare < %s | FileCheck %s
+; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s
-; Ensures that dxil-prepare will remove the dx.rootsignatures metadata
+; Ensures that dxil-translate-metadata will remove the dx.rootsignatures metadata
target triple = "dxil-unknown-shadermodel6.0-compute"
@@ -10,7 +10,6 @@ entry:
}
; CHECK-NOT: !dx.rootsignatures
-; CHECK-NOT: {{^!}}
!dx.rootsignatures = !{!2} ; list of function/root signature pairs
!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll
new file mode 100644
index 0000000..4385da3
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll
@@ -0,0 +1,63 @@
+; Make sure we do not assert for the cases we do not handle.
+; RUN: llc -march=hexagon -mattr=+hvx,+hvx-length128b,+hvxv75,+v75,-long-calls < %s | FileCheck %s
+
+; Mainly make sure we do not core dump.
+; CHECK-NOT: scatter
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind memory(argmem: write, inaccessiblemem: readwrite)
+define dso_local void @foo(ptr noundef writeonly captures(none) %cptr, i32 noundef %T, i32 noundef %W) local_unnamed_addr #0 {
+entry:
+ %invariant.gep11 = getelementptr i8, ptr %cptr, i32 0
+ %invariant.gep13 = getelementptr i8, ptr %invariant.gep11, i32 0
+ %cmp.not15 = icmp ugt i32 8, %T
+ br i1 %cmp.not15, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph: ; preds = %entry
+ %cmp3.not8 = icmp ugt i32 8, %W
+ %conv.ripple.LS.instance = trunc i32 %W to i8
+ %conv.ripple.LS.instance.ripple.bcast.splatinsert = insertelement <64 x i8> poison, i8 %conv.ripple.LS.instance, i64 0
+ %conv.ripple.LS.instance.ripple.bcast.splat = shufflevector <64 x i8> %conv.ripple.LS.instance.ripple.bcast.splatinsert, <64 x i8> poison, <64 x i32> zeroinitializer
+ br label %for.cond1.preheader
+
+for.cond.loopexit: ; preds = %for.body5, %for.cond1.preheader
+ %add = add i32 %add17, 8
+ %cmp.not = icmp ugt i32 %add, %T
+ br i1 %cmp.not, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.cond.loopexit
+ %add17 = phi i32 [ 8, %for.cond1.preheader.lr.ph ], [ %add, %for.cond.loopexit ]
+ %t.016 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add17, %for.cond.loopexit ]
+ br i1 %cmp3.not8, label %for.cond.loopexit, label %for.body5.lr.ph
+
+for.body5.lr.ph: ; preds = %for.cond1.preheader
+ %gep14 = getelementptr i8, ptr %invariant.gep13, i32 %t.016
+ br label %for.body5
+
+for.cond.cleanup: ; preds = %for.cond.loopexit, %entry
+ ret void
+
+for.body5: ; preds = %for.body5.lr.ph, %for.body5
+ %add210 = phi i32 [ 8, %for.body5.lr.ph ], [ %add2, %for.body5 ]
+ %w.09 = phi i32 [ 0, %for.body5.lr.ph ], [ %add210, %for.body5 ]
+ %gep = getelementptr i8, ptr %gep14, i32 %w.09
+ %gep.ripple.LS.instance = getelementptr i8, ptr %gep, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+ call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %conv.ripple.LS.instance.ripple.bcast.splat, <64 x ptr> %gep.ripple.LS.instance, i32 1, <64 x i1> splat (i1 true))
+ %add2 = add i32 %add210, 8
+ %cmp3.not = icmp ugt i32 %add2, %W
+ br i1 %cmp3.not, label %for.cond.loopexit, label %for.body5
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
+declare void @llvm.ripple.block.setsize.i32(i32 immarg %0, i32 immarg %1, i32 %2) #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare i32 @llvm.ripple.block.index.i32(i32 immarg %0, i32 immarg %1) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare i32 @llvm.ripple.block.getsize.i32(i32 immarg %0, i32 immarg %1) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
+declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %0, <64 x ptr> %1, i32 immarg %2, <64 x i1> %3) #3
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll
new file mode 100644
index 0000000..83fd63e
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b,-long-calls -hexagon-allow-scatter-gather-hvx < %s | FileCheck %s
+
+; CHECK-LABEL: Ripple_gather_32:
+; CHECK: vtmp.w = vgather
+; CHECK-LABEL: Ripple_gather_16:
+; CHECK: vtmp.h = vgather
+; CHECK-LABEL: Ripple_gather_8:
+; CHECK: vand
+; CHECK: vpacke
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Function Attrs: nofree noinline norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite)
+define dso_local void @Ripple_gather_32(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %source.ripple.bcast.splatinsert = insertelement <32 x ptr> poison, ptr %source, i64 0
+ %source.ripple.bcast.splat = shufflevector <32 x ptr> %source.ripple.bcast.splatinsert, <32 x ptr> poison, <32 x i32> zeroinitializer
+ %0 = load <32 x i32>, ptr %indexes, align 4
+ %arrayidx2.ripple.vectorized = getelementptr inbounds i32, <32 x ptr> %source.ripple.bcast.splat, <32 x i32> %0
+ %1 = tail call <32 x i32> @llvm.masked.gather.v32i32.v32p0(<32 x ptr> %arrayidx2.ripple.vectorized, i32 4, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i32> poison)
+ store <32 x i32> %1, ptr %destination, align 4
+ ret void
+}
+
+; Function Attrs: nofree noinline norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite)
+define dso_local void @Ripple_gather_16(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %source.ripple.bcast.splatinsert = insertelement <64 x ptr> poison, ptr %source, i64 0
+ %source.ripple.bcast.splat = shufflevector <64 x ptr> %source.ripple.bcast.splatinsert, <64 x ptr> poison, <64 x i32> zeroinitializer
+ %0 = load <64 x i16>, ptr %indexes, align 2
+ %idxprom.ripple.vectorized = zext <64 x i16> %0 to <64 x i32>
+ %arrayidx2.ripple.vectorized = getelementptr inbounds i16, <64 x ptr> %source.ripple.bcast.splat, <64 x i32> %idxprom.ripple.vectorized
+ %1 = tail call <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr> %arrayidx2.ripple.vectorized, i32 2, <64 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <64 x i16> poison)
+ store <64 x i16> %1, ptr %destination, align 2
+ ret void
+}
+
+; Function Attrs: nofree noinline norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite)
+define dso_local void @Ripple_gather_8(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %source.ripple.bcast.splatinsert = insertelement <128 x ptr> poison, ptr %source, i64 0
+ %source.ripple.bcast.splat = shufflevector <128 x ptr> %source.ripple.bcast.splatinsert, <128 x ptr> poison, <128 x i32> zeroinitializer
+ %0 = load <128 x i8>, ptr %indexes, align 1
+ %idxprom.ripple.vectorized = zext <128 x i8> %0 to <128 x i32>
+ %arrayidx2.ripple.vectorized = getelementptr inbounds i8, <128 x ptr> %source.ripple.bcast.splat, <128 x i32> %idxprom.ripple.vectorized
+ %1 = tail call <128 x i8> @llvm.masked.gather.v128i8.v128p0(<128 x ptr> %arrayidx2.ripple.vectorized, i32 1, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <128 x i8> poison)
+ store <128 x i8> %1, ptr %destination, align 1
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare <32 x i32> @llvm.masked.gather.v32i32.v32p0(<32 x ptr>, i32 immarg, <32 x i1>, <32 x i32>) #1
+declare <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr>, i32 immarg, <64 x i1>, <64 x i16>) #1
+declare <128 x i8> @llvm.masked.gather.v128i8.v128p0(<128 x ptr> %0, i32 immarg %1, <128 x i1> %2, <128 x i8> %3) #1
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll
new file mode 100644
index 0000000..1bd79d7
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll
@@ -0,0 +1,54 @@
+; Verify that we generate HVX vgather for the given input.
+; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b,-long-calls -hexagon-allow-scatter-gather-hvx < %s | FileCheck %s
+; CHECK-LABEL: SpVV_Ripple:
+; CHECK: vtmp.h = vgather(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h
+; CHECK: vmem(r0+#0) = vtmp.new
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define dso_local i32 @SpVV_Ripple(ptr nocapture noundef writeonly %scratchpad, ptr nocapture noundef readonly %Source_1, ptr nocapture noundef readonly %S_index, i32 noundef %nS, ptr nocapture noundef readonly %Source_2) local_unnamed_addr #1 {
+entry:
+ %Source_2.ripple.bcast.splatinsert = insertelement <64 x ptr> poison, ptr %Source_2, i64 0
+ %Source_2.ripple.bcast.splat = shufflevector <64 x ptr> %Source_2.ripple.bcast.splatinsert, <64 x ptr> poison, <64 x i32> zeroinitializer
+ %div16 = lshr i32 %nS, 6
+ %cmp6.not = icmp ult i32 %nS, 64
+ br i1 %cmp6.not, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %lsr.iv17 = phi ptr [ %scevgep18, %for.body ], [ %S_index, %entry ]
+ %lsr.iv = phi ptr [ %scevgep, %for.body ], [ %Source_1, %entry ]
+ %result.08.ripple.vectorized = phi <64 x i32> [ %add8.ripple.vectorized, %for.body ], [ zeroinitializer, %entry ]
+ %_ripple_block_0.07 = phi i32 [ %add9, %for.body ], [ 0, %entry ]
+ %.ripple.LS.instance = load <64 x i16>, ptr %lsr.iv17, align 2
+ %idxprom.ripple.LS.instance = sext <64 x i16> %.ripple.LS.instance to <64 x i32>
+ %arrayidx2.ripple.LS.instance = getelementptr inbounds i16, <64 x ptr> %Source_2.ripple.bcast.splat, <64 x i32> %idxprom.ripple.LS.instance
+ %.ripple.LS.instance13 = tail call <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr> %arrayidx2.ripple.LS.instance, i32 2, <64 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <64 x i16> poison)
+ store <64 x i16> %.ripple.LS.instance13, ptr %scratchpad, align 2
+ %.ripple.LS.instance15 = load <64 x i16>, ptr %lsr.iv, align 2
+ %conv.ripple.LS.instance = sext <64 x i16> %.ripple.LS.instance15 to <64 x i32>
+ %conv6.ripple.LS.instance = sext <64 x i16> %.ripple.LS.instance13 to <64 x i32>
+ %mul7.ripple.LS.instance = mul nsw <64 x i32> %conv.ripple.LS.instance, %conv6.ripple.LS.instance
+ %add8.ripple.vectorized = add <64 x i32> %mul7.ripple.LS.instance, %result.08.ripple.vectorized
+ %add9 = add nuw nsw i32 %_ripple_block_0.07, 1
+ %scevgep = getelementptr i8, ptr %lsr.iv, i32 128
+ %scevgep18 = getelementptr i8, ptr %lsr.iv17, i32 128
+ %cmp = icmp ult i32 %add9, %div16
+ br i1 %cmp, label %for.body, label %for.end
+for.end: ; preds = %for.body, %entry
+ %result.0.lcssa.ripple.LS.instance = phi <64 x i32> [ zeroinitializer, %entry ], [ %add8.ripple.vectorized, %for.body ]
+ %rdx.shuf = shufflevector <64 x i32> %result.0.lcssa.ripple.LS.instance, <64 x i32> poison, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx = add <64 x i32> %result.0.lcssa.ripple.LS.instance, %rdx.shuf
+ %rdx.shuf19 = shufflevector <64 x i32> %bin.rdx, <64 x i32> poison, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx20 = add <64 x i32> %bin.rdx, %rdx.shuf19
+ %rdx.shuf21 = shufflevector <64 x i32> %bin.rdx20, <64 x i32> poison, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx22 = add <64 x i32> %bin.rdx20, %rdx.shuf21
+ %rdx.shuf23 = shufflevector <64 x i32> %bin.rdx22, <64 x i32> poison, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx24 = add <64 x i32> %bin.rdx22, %rdx.shuf23
+ %rdx.shuf25 = shufflevector <64 x i32> %bin.rdx24, <64 x i32> poison, <64 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx26 = add <64 x i32> %bin.rdx24, %rdx.shuf25
+ %rdx.shuf27 = shufflevector <64 x i32> %bin.rdx26, <64 x i32> poison, <64 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx28 = add <64 x i32> %bin.rdx26, %rdx.shuf27
+ %0 = extractelement <64 x i32> %bin.rdx28, i32 0
+ ret i32 %0
+}
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll
new file mode 100644
index 0000000..85d2999
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll
@@ -0,0 +1,52 @@
+; RUN: llc -march=hexagon -mattr=+hvx-length128b,+hvxv73,+v73,-long-calls -hexagon-allow-scatter-gather-hvx < %s | FileCheck %s
+
+; CHECK-LABEL: Ripple_scatter_8:
+; CHECK: if (q{{[0-9]+}}) vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h
+; CHECK: if (q{{[0-9]+}}) vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h
+; CHECK-LABEL: Ripple_scatter_16:
+; CHECK: vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h = v{{[0-9]+}}
+; CHECK-LABEL: Ripple_scatter_32:
+; CHECK: vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.w).w = v{{[0-9]+}}
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define dso_local void @Ripple_scatter_8(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %destination.ripple.bcast.splatinsert = insertelement <128 x ptr> poison, ptr %destination, i64 0
+ %destination.ripple.bcast.splat = shufflevector <128 x ptr> %destination.ripple.bcast.splatinsert, <128 x ptr> poison, <128 x i32> zeroinitializer
+ %.ripple.LS.instance11 = load <128 x i8>, ptr %source, align 1
+ %.ripple.LS.instance = load <128 x i8>, ptr %indexes, align 1
+ %idxprom.ripple.LS.instance = zext <128 x i8> %.ripple.LS.instance to <128 x i32>
+ %arrayidx3.ripple.LS.instance = getelementptr inbounds i8, <128 x ptr> %destination.ripple.bcast.splat, <128 x i32> %idxprom.ripple.LS.instance
+ %cst_ptr_to_i32 = ptrtoint ptr %destination to i32
+ tail call void @llvm.masked.scatter.v128i8.v128p0(<128 x i8> %.ripple.LS.instance11, <128 x ptr> %arrayidx3.ripple.LS.instance, i32 1, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+define dso_local void @Ripple_scatter_16(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %destination.ripple.bcast.splatinsert = insertelement <64 x ptr> poison, ptr %destination, i64 0
+ %destination.ripple.bcast.splat = shufflevector <64 x ptr> %destination.ripple.bcast.splatinsert, <64 x ptr> poison, <64 x i32> zeroinitializer
+ %.ripple.LS.instance11 = load <64 x i16>, ptr %source, align 2
+ %.ripple.LS.instance = load <64 x i16>, ptr %indexes, align 2
+ %idxprom.ripple.LS.instance = zext <64 x i16> %.ripple.LS.instance to <64 x i32>
+ %arrayidx3.ripple.LS.instance = getelementptr inbounds i16, <64 x ptr> %destination.ripple.bcast.splat, <64 x i32> %idxprom.ripple.LS.instance
+ tail call void @llvm.masked.scatter.v64i16.v64p0(<64 x i16> %.ripple.LS.instance11, <64 x ptr> %arrayidx3.ripple.LS.instance, i32 2, <64 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+define dso_local void @Ripple_scatter_32(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %destination.ripple.bcast.splatinsert = insertelement <32 x ptr> poison, ptr %destination, i64 0
+ %destination.ripple.bcast.splat = shufflevector <32 x ptr> %destination.ripple.bcast.splatinsert, <32 x ptr> poison, <32 x i32> zeroinitializer
+ %.ripple.LS.instance11 = load <32 x i32>, ptr %source, align 4
+ %.ripple.LS.instance = load <32 x i32>, ptr %indexes, align 4
+ %arrayidx3.ripple.LS.instance = getelementptr inbounds i32, <32 x ptr> %destination.ripple.bcast.splat, <32 x i32> %.ripple.LS.instance
+ tail call void @llvm.masked.scatter.v32i32.v32p0(<32 x i32> %.ripple.LS.instance11, <32 x ptr> %arrayidx3.ripple.LS.instance, i32 4, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+declare void @llvm.masked.scatter.v128i8.v128p0(<128 x i8> %0, <128 x ptr> %1, i32 immarg %2, <128 x i1> %3) #2
+declare void @llvm.masked.scatter.v64i16.v64p0(<64 x i16> %0, <64 x ptr> %1, i32 immarg %2, <64 x i1> %3) #2
+declare void @llvm.masked.scatter.v32i32.v32p0(<32 x i32> %0, <32 x ptr> %1, i32 immarg %2, <32 x i1> %3) #2
diff --git a/llvm/test/CodeGen/Hexagon/masked_gather.ll b/llvm/test/CodeGen/Hexagon/masked_gather.ll
new file mode 100644
index 0000000..461fd79
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/masked_gather.ll
@@ -0,0 +1,58 @@
+; This produced a masked gather that we are not yet handling.
+; REQUIRES: asserts
+; RUN: opt -march=hexagon -passes=loop-vectorize -hexagon-autohvx -mattr=+hvx-length128b,+hvxv68,+v68,+hvx-ieee-fp,-long-calls,-packets -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Original C++
+; clang -c -Os -mhvx -mhvx-ieee-fp -fvectorize -mno-packets -fno-strict-aliasing -mv68
+;typedef struct poptContext_s * poptContext;
+;typedef struct { unsigned int bits[1]; } pbm_set;
+;struct poptContext_s { pbm_set * arg_strip; };
+;
+;int poptStrippedArgv(poptContext con, int argc, char ** argv) {
+; int numargs = argc;
+; for (int i = 1; i < argc; i++) {
+; if (((((con->arg_strip)->bits)[((i) / (8 * sizeof (unsigned int)))] & ((unsigned int) 1 << ((i) % (8 * sizeof (unsigned int))))) != 0))
+; numargs--;
+; }
+; return numargs;
+;}
+
+; CHECK-NOT: masked_gather
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon-unknown-unknown-elf"
+
+; Function Attrs: nofree norecurse nosync nounwind optsize memory(read, inaccessiblemem: none)
+define dso_local i32 @poptStrippedArgv(ptr noundef readonly captures(none) %con, i32 noundef %argc, ptr noundef readnone captures(none) %argv) local_unnamed_addr #0 {
+entry:
+ %cmp8 = icmp sgt i32 %argc, 1
+ br i1 %cmp8, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ %0 = load ptr, ptr %con, align 4
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %numargs.0.lcssa = phi i32 [ %argc, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %numargs.0.lcssa
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %i.010 = phi i32 [ 1, %for.body.lr.ph ], [ %inc, %for.body ]
+ %numargs.09 = phi i32 [ %argc, %for.body.lr.ph ], [ %spec.select, %for.body ]
+ %div7 = lshr i32 %i.010, 5
+ %arrayidx = getelementptr inbounds nuw [1 x i32], ptr %0, i32 0, i32 %div7
+ %1 = load i32, ptr %arrayidx, align 4
+ %rem = and i32 %i.010, 31
+ %shl = shl nuw i32 1, %rem
+ %and = and i32 %1, %shl
+ %cmp1.not = icmp ne i32 %and, 0
+ %dec = sext i1 %cmp1.not to i32
+ %spec.select = add nsw i32 %numargs.09, %dec
+ %inc = add nuw nsw i32 %i.010, 1
+ %exitcond.not = icmp eq i32 %inc, %argc
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/CodeGen/Hexagon/vector-gather.ll b/llvm/test/CodeGen/Hexagon/vector-gather.ll
new file mode 100644
index 0000000..5700380
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/vector-gather.ll
@@ -0,0 +1,27 @@
+; REQUIRES: hexagon-registered-target
+; RUN: llc -march=hexagon -mcpu=hexagonv73 -mattr=+hvxv73,+hvx-length128b < %s | FileCheck %s
+
+target triple = "hexagon"
+
+@VTCM_SCATTER16_ADDRESS = dso_local global i32 0, align 4
+@region_len = dso_local global i32 16383, align 4
+
+; CHECK: [[ADR:r[0-9]+]] = memw(gp+#VTCM_SCATTER16_ADDRESS)
+; CHECK: vtmp.h = vgather([[ADR]],m0,v0.h).h
+; CHECK: vmem(r0+#0) = vtmp.new
+
+define dso_local void @vector_gather_16(ptr noundef %vgather, <32 x i32> noundef %offsets) #0 {
+entry:
+ %vgather.addr = alloca ptr, align 4
+ %offsets.addr = alloca <32 x i32>, align 128
+ store ptr %vgather, ptr %vgather.addr, align 4
+ store <32 x i32> %offsets, ptr %offsets.addr, align 128
+ %0 = load ptr, ptr %vgather.addr, align 4
+ %1 = load i32, ptr @VTCM_SCATTER16_ADDRESS, align 4
+ %2 = load i32, ptr @region_len, align 4
+ %3 = load <32 x i32>, ptr %offsets.addr, align 128
+ call void @llvm.hexagon.V6.vgathermh.128B(ptr %0, i32 %1, i32 %2, <32 x i32> %3)
+ ret void
+}
+
+declare <128 x i1> @llvm.hexagon.V6.vandvrt.128B(<32 x i32>, i32)
diff --git a/llvm/test/CodeGen/Mips/Fast-ISel/br1.ll b/llvm/test/CodeGen/Mips/Fast-ISel/br1.ll
index b5bdf84..9630dab 100644
--- a/llvm/test/CodeGen/Mips/Fast-ISel/br1.ll
+++ b/llvm/test/CodeGen/Mips/Fast-ISel/br1.ll
@@ -31,4 +31,4 @@ if.end: ; preds = %entry, %if.then
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/Fast-ISel/loadstore2.ll b/llvm/test/CodeGen/Mips/Fast-ISel/loadstore2.ll
index a5c1cec0..d3d2e8b 100644
--- a/llvm/test/CodeGen/Mips/Fast-ISel/loadstore2.ll
+++ b/llvm/test/CodeGen/Mips/Fast-ISel/loadstore2.ll
@@ -80,6 +80,6 @@ entry:
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/Fast-ISel/loadstrconst.ll b/llvm/test/CodeGen/Mips/Fast-ISel/loadstrconst.ll
index bc6f2c5..e685465 100644
--- a/llvm/test/CodeGen/Mips/Fast-ISel/loadstrconst.ll
+++ b/llvm/test/CodeGen/Mips/Fast-ISel/loadstrconst.ll
@@ -17,5 +17,5 @@ entry:
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/Fast-ISel/logopm.ll b/llvm/test/CodeGen/Mips/Fast-ISel/logopm.ll
index 90db1fd..f3b902b 100644
--- a/llvm/test/CodeGen/Mips/Fast-ISel/logopm.ll
+++ b/llvm/test/CodeGen/Mips/Fast-ISel/logopm.ll
@@ -590,8 +590,8 @@ entry:
ret void
}
-attributes #0 = { noinline nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind }
!llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll b/llvm/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll
index d1a0574..eca0d16 100644
--- a/llvm/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll
+++ b/llvm/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll
@@ -51,4 +51,4 @@ entry:
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/Fast-ISel/simplestorei.ll b/llvm/test/CodeGen/Mips/Fast-ISel/simplestorei.ll
index ee174dd..33b4ef8 100644
--- a/llvm/test/CodeGen/Mips/Fast-ISel/simplestorei.ll
+++ b/llvm/test/CodeGen/Mips/Fast-ISel/simplestorei.ll
@@ -63,6 +63,6 @@ entry:
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/beqzc.ll b/llvm/test/CodeGen/Mips/beqzc.ll
index 28f3f8c..42eb392 100644
--- a/llvm/test/CodeGen/Mips/beqzc.ll
+++ b/llvm/test/CodeGen/Mips/beqzc.ll
@@ -14,7 +14,7 @@ entry:
ret i32 0
}
-attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
diff --git a/llvm/test/CodeGen/Mips/beqzc1.ll b/llvm/test/CodeGen/Mips/beqzc1.ll
index 915f34e..01bb5f1 100644
--- a/llvm/test/CodeGen/Mips/beqzc1.ll
+++ b/llvm/test/CodeGen/Mips/beqzc1.ll
@@ -19,6 +19,6 @@ if.end: ; preds = %if.then, %entry
ret i32 0
}
-attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
diff --git a/llvm/test/CodeGen/Mips/brsize3.ll b/llvm/test/CodeGen/Mips/brsize3.ll
index 1aea201..20aab184 100644
--- a/llvm/test/CodeGen/Mips/brsize3.ll
+++ b/llvm/test/CodeGen/Mips/brsize3.ll
@@ -33,7 +33,7 @@ x: ; preds = %x, %entry
}
-attributes #0 = { noreturn nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { noreturn nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
attributes #1 = { nounwind }
!1 = !{i32 45}
diff --git a/llvm/test/CodeGen/Mips/brsize3a.ll b/llvm/test/CodeGen/Mips/brsize3a.ll
index de866f2..b1ebbd8 100644
--- a/llvm/test/CodeGen/Mips/brsize3a.ll
+++ b/llvm/test/CodeGen/Mips/brsize3a.ll
@@ -20,7 +20,7 @@ x: ; preds = %x, %entry
}
-attributes #0 = { noreturn nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { noreturn nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
attributes #1 = { nounwind }
!1 = !{i32 45}
diff --git a/llvm/test/CodeGen/Mips/ci2.ll b/llvm/test/CodeGen/Mips/ci2.ll
index a949729..4901d8d 100644
--- a/llvm/test/CodeGen/Mips/ci2.ll
+++ b/llvm/test/CodeGen/Mips/ci2.ll
@@ -33,7 +33,7 @@ if.end: ; preds = %if.else, %if.then
; constisle .4byte 305419896 # 0x12345678
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind }
!1 = !{i32 103}
diff --git a/llvm/test/CodeGen/Mips/cmplarge.ll b/llvm/test/CodeGen/Mips/cmplarge.ll
index db7f37a..bfb6080 100644
--- a/llvm/test/CodeGen/Mips/cmplarge.ll
+++ b/llvm/test/CodeGen/Mips/cmplarge.ll
@@ -33,6 +33,6 @@ for.end: ; preds = %for.body, %entry
; cmp16: .end getSubImagesLuma
declare i32 @iClip3(...) #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Mips/const1.ll b/llvm/test/CodeGen/Mips/const1.ll
index 750912d..7915d66 100644
--- a/llvm/test/CodeGen/Mips/const1.ll
+++ b/llvm/test/CodeGen/Mips/const1.ll
@@ -28,7 +28,7 @@ entry:
; CHECK: .4byte 262991277
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/Mips/const4a.ll b/llvm/test/CodeGen/Mips/const4a.ll
index 245abbf..e88ffd3 100644
--- a/llvm/test/CodeGen/Mips/const4a.ll
+++ b/llvm/test/CodeGen/Mips/const4a.ll
@@ -172,8 +172,8 @@ declare void @goo(...) #1
declare void @hoo(...) #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/Mips/const6.ll b/llvm/test/CodeGen/Mips/const6.ll
index f40eeef..480a958 100644
--- a/llvm/test/CodeGen/Mips/const6.ll
+++ b/llvm/test/CodeGen/Mips/const6.ll
@@ -154,8 +154,8 @@ entry:
declare void @hoo(...) #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/Mips/const6a.ll b/llvm/test/CodeGen/Mips/const6a.ll
index 720edd3a..eb62e27 100644
--- a/llvm/test/CodeGen/Mips/const6a.ll
+++ b/llvm/test/CodeGen/Mips/const6a.ll
@@ -23,7 +23,7 @@ entry:
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
attributes #1 = { nounwind }
!1 = !{i32 121}
diff --git a/llvm/test/CodeGen/Mips/ctlz.ll b/llvm/test/CodeGen/Mips/ctlz.ll
index 3cc1569..49eb36f 100644
--- a/llvm/test/CodeGen/Mips/ctlz.ll
+++ b/llvm/test/CodeGen/Mips/ctlz.ll
@@ -22,6 +22,6 @@ declare i32 @llvm.ctlz.i32(i32, i1) #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/Mips/delay-slot-fill-forward.ll b/llvm/test/CodeGen/Mips/delay-slot-fill-forward.ll
index 7c41641..43fd36b 100644
--- a/llvm/test/CodeGen/Mips/delay-slot-fill-forward.ll
+++ b/llvm/test/CodeGen/Mips/delay-slot-fill-forward.ll
@@ -161,7 +161,7 @@ if.end461: ; preds = %if.end436, %for.bod
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="mips32r2" "target-features"="+mips32r2,+nooddspreg,+fpxx" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="mips32r2" "target-features"="+mips32r2,+nooddspreg,+fpxx" "use-soft-float"="false" }
attributes #1 = { nounwind }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/Mips/f16abs.ll b/llvm/test/CodeGen/Mips/f16abs.ll
index 23bf402..242d8ff 100644
--- a/llvm/test/CodeGen/Mips/f16abs.ll
+++ b/llvm/test/CodeGen/Mips/f16abs.ll
@@ -29,8 +29,8 @@ declare double @fabs(double) #1
declare float @fabsf(float) #1
-attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
-attributes #1 = { nounwind optsize readnone "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
+attributes #1 = { nounwind optsize readnone "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
attributes #2 = { nounwind optsize readnone }
diff --git a/llvm/test/CodeGen/Mips/fp16instrinsmc.ll b/llvm/test/CodeGen/Mips/fp16instrinsmc.ll
index 6c29c08..1582605 100644
--- a/llvm/test/CodeGen/Mips/fp16instrinsmc.ll
+++ b/llvm/test/CodeGen/Mips/fp16instrinsmc.ll
@@ -385,7 +385,7 @@ entry:
; Function Attrs: nounwind
declare double @exp2(double) #0
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/Mips/fpneeded.ll b/llvm/test/CodeGen/Mips/fpneeded.ll
index cc82f81..babfcad 100644
--- a/llvm/test/CodeGen/Mips/fpneeded.ll
+++ b/llvm/test/CodeGen/Mips/fpneeded.ll
@@ -131,7 +131,7 @@ entry:
; 32: .set reorder
; 32: .end foo3
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
define void @vv() #0 {
entry:
diff --git a/llvm/test/CodeGen/Mips/fpnotneeded.ll b/llvm/test/CodeGen/Mips/fpnotneeded.ll
index 761ef30..2b98f7e 100644
--- a/llvm/test/CodeGen/Mips/fpnotneeded.ll
+++ b/llvm/test/CodeGen/Mips/fpnotneeded.ll
@@ -61,7 +61,7 @@ entry:
; cisle: .end foo
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
define float @fv() #0 {
diff --git a/llvm/test/CodeGen/Mips/hf16call32.ll b/llvm/test/CodeGen/Mips/hf16call32.ll
index e187b76..33353b6 100644
--- a/llvm/test/CodeGen/Mips/hf16call32.ll
+++ b/llvm/test/CodeGen/Mips/hf16call32.ll
@@ -1026,5 +1026,5 @@ declare { double, double } @dc_sf(float) #1
; stel: jr $18
; stel: .end __call_stub_fp_dc_sf
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/hf16call32_body.ll b/llvm/test/CodeGen/Mips/hf16call32_body.ll
index 3bcb6f6..2eea4c3 100644
--- a/llvm/test/CodeGen/Mips/hf16call32_body.ll
+++ b/llvm/test/CodeGen/Mips/hf16call32_body.ll
@@ -303,4 +303,4 @@ entry:
; stel: $__fn_local_sf_df_df = sf_df_df
; stel: .end __fn_stub_sf_df_df
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/hfptrcall.ll b/llvm/test/CodeGen/Mips/hfptrcall.ll
index 920c694..2babc67 100644
--- a/llvm/test/CodeGen/Mips/hfptrcall.ll
+++ b/llvm/test/CodeGen/Mips/hfptrcall.ll
@@ -118,8 +118,8 @@ entry:
declare i32 @printf(ptr, ...) #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/l3mc.ll b/llvm/test/CodeGen/Mips/l3mc.ll
index 440da3a..dc68eaf 100644
--- a/llvm/test/CodeGen/Mips/l3mc.ll
+++ b/llvm/test/CodeGen/Mips/l3mc.ll
@@ -99,7 +99,7 @@ entry:
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
; __call_stub_fp___fixunsdfsi: __call_stub_fp___fixunsdfsi:
; __call_stub_fp___floatdidf: __call_stub_fp___floatdidf:
diff --git a/llvm/test/CodeGen/Mips/lcb2.ll b/llvm/test/CodeGen/Mips/lcb2.ll
index 036de38..79f4b43 100644
--- a/llvm/test/CodeGen/Mips/lcb2.ll
+++ b/llvm/test/CodeGen/Mips/lcb2.ll
@@ -115,7 +115,7 @@ if.end: ; preds = %if.then, %entry
; lcb: .end btz
; lcbn: .end btz
-attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/Mips/lcb3c.ll b/llvm/test/CodeGen/Mips/lcb3c.ll
index 40912f3..dd88924 100644
--- a/llvm/test/CodeGen/Mips/lcb3c.ll
+++ b/llvm/test/CodeGen/Mips/lcb3c.ll
@@ -51,7 +51,7 @@ if.end: ; preds = %if.else, %if.then
; lcb: jal $BB1_2 # branch
; lcb: $BB1_1: # %if.then
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/Mips/lcb4a.ll b/llvm/test/CodeGen/Mips/lcb4a.ll
index a0258b1..ad843bb 100644
--- a/llvm/test/CodeGen/Mips/lcb4a.ll
+++ b/llvm/test/CodeGen/Mips/lcb4a.ll
@@ -55,7 +55,7 @@ if.end: ; preds = %if.else, %if.then
; ci: nop
; ci: $BB1_1: # %if.else
-attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/Mips/lcb5.ll b/llvm/test/CodeGen/Mips/lcb5.ll
index 22baeba..0d479ff 100644
--- a/llvm/test/CodeGen/Mips/lcb5.ll
+++ b/llvm/test/CodeGen/Mips/lcb5.ll
@@ -216,7 +216,7 @@ if.end: ; preds = %if.then, %entry
; ci: .p2align 2
; ci: .end z4
-attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/Mips/mbrsize4a.ll b/llvm/test/CodeGen/Mips/mbrsize4a.ll
index b8d2e2d..e6c620a 100644
--- a/llvm/test/CodeGen/Mips/mbrsize4a.ll
+++ b/llvm/test/CodeGen/Mips/mbrsize4a.ll
@@ -30,8 +30,8 @@ declare i32 @foo(...) #1
declare i32 @printf(ptr, ...) #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind }
!1 = !{i32 68}
diff --git a/llvm/test/CodeGen/Mips/micromips-attr.ll b/llvm/test/CodeGen/Mips/micromips-attr.ll
index 8e70cc6..1915f3b 100644
--- a/llvm/test/CodeGen/Mips/micromips-attr.ll
+++ b/llvm/test/CodeGen/Mips/micromips-attr.ll
@@ -24,7 +24,7 @@ attributes #0 = {
"less-precise-fpmad"="false" "frame-pointer"="none"
"no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false"
"no-signed-zeros-fp-math"="false" "no-trapping-math"="false"
- "stack-protector-buffer-size"="8" "unsafe-fp-math"="false"
+ "stack-protector-buffer-size"="8"
"use-soft-float"="false"
}
@@ -34,6 +34,6 @@ attributes #1 = {
"less-precise-fpmad"="false" "frame-pointer"="none"
"no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false"
"no-signed-zeros-fp-math"="false" "no-trapping-math"="false"
- "stack-protector-buffer-size"="8" "unsafe-fp-math"="false"
+ "stack-protector-buffer-size"="8"
"use-soft-float"="false"
}
diff --git a/llvm/test/CodeGen/Mips/mips16-hf-attr-2.ll b/llvm/test/CodeGen/Mips/mips16-hf-attr-2.ll
index 80294b5..eaa39e9 100644
--- a/llvm/test/CodeGen/Mips/mips16-hf-attr-2.ll
+++ b/llvm/test/CodeGen/Mips/mips16-hf-attr-2.ll
@@ -28,18 +28,18 @@ attributes #0 = {
"less-precise-fpmad"="false" "frame-pointer"="all"
"frame-pointer"="non-leaf" "no-infs-fp-math"="false"
"no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
- "unsafe-fp-math"="false" "use-soft-float"="false"
+ "use-soft-float"="false"
}
attributes #1 = {
nounwind
"less-precise-fpmad"="false" "frame-pointer"="all"
"frame-pointer"="non-leaf" "no-infs-fp-math"="false"
"no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
- "unsafe-fp-math"="false" "use-soft-float"="true"
+ "use-soft-float"="true"
}
attributes #2 = {
"less-precise-fpmad"="false" "frame-pointer"="all"
"frame-pointer"="non-leaf" "no-infs-fp-math"="false"
"no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
- "unsafe-fp-math"="false" "use-soft-float"="true"
+ "use-soft-float"="true"
}
diff --git a/llvm/test/CodeGen/Mips/mips16-hf-attr.ll b/llvm/test/CodeGen/Mips/mips16-hf-attr.ll
index c8af712..cafa2d5 100644
--- a/llvm/test/CodeGen/Mips/mips16-hf-attr.ll
+++ b/llvm/test/CodeGen/Mips/mips16-hf-attr.ll
@@ -28,18 +28,18 @@ attributes #0 = {
"less-precise-fpmad"="false" "frame-pointer"="all"
"frame-pointer"="non-leaf" "no-infs-fp-math"="false"
"no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
- "unsafe-fp-math"="false" "use-soft-float"="false"
+ "use-soft-float"="false"
}
attributes #1 = {
nounwind
"less-precise-fpmad"="false" "frame-pointer"="all"
"frame-pointer"="non-leaf" "no-infs-fp-math"="false"
"no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
- "unsafe-fp-math"="false" "use-soft-float"="true"
+ "use-soft-float"="true"
}
attributes #2 = {
"less-precise-fpmad"="false" "frame-pointer"="all"
"frame-pointer"="non-leaf" "no-infs-fp-math"="false"
"no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
- "unsafe-fp-math"="false" "use-soft-float"="true"
+ "use-soft-float"="true"
}
diff --git a/llvm/test/CodeGen/Mips/mips16_32_1.ll b/llvm/test/CodeGen/Mips/mips16_32_1.ll
index 0d02022..963fb58 100644
--- a/llvm/test/CodeGen/Mips/mips16_32_1.ll
+++ b/llvm/test/CodeGen/Mips/mips16_32_1.ll
@@ -10,4 +10,4 @@ entry:
; CHECK: .ent foo
; CHECK: jrc $ra
; CHECK: .end foo
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/mips16_32_10.ll b/llvm/test/CodeGen/Mips/mips16_32_10.ll
index 86378ff..e0d6859 100644
--- a/llvm/test/CodeGen/Mips/mips16_32_10.ll
+++ b/llvm/test/CodeGen/Mips/mips16_32_10.ll
@@ -53,6 +53,6 @@ entry:
-attributes #0 = { nounwind "less-precise-fpmad"="false" "nomips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "nomips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "nomips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "nomips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/mips16_32_3.ll b/llvm/test/CodeGen/Mips/mips16_32_3.ll
index ee33abc..dc2fe29 100644
--- a/llvm/test/CodeGen/Mips/mips16_32_3.ll
+++ b/llvm/test/CodeGen/Mips/mips16_32_3.ll
@@ -62,6 +62,6 @@ entry:
; 32: .set reorder
; 32: .end main
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/mips16_32_4.ll b/llvm/test/CodeGen/Mips/mips16_32_4.ll
index da926342..2fed74d 100644
--- a/llvm/test/CodeGen/Mips/mips16_32_4.ll
+++ b/llvm/test/CodeGen/Mips/mips16_32_4.ll
@@ -56,6 +56,6 @@ entry:
; 32: .end main
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/mips16_32_5.ll b/llvm/test/CodeGen/Mips/mips16_32_5.ll
index 6692460..2bbe778 100644
--- a/llvm/test/CodeGen/Mips/mips16_32_5.ll
+++ b/llvm/test/CodeGen/Mips/mips16_32_5.ll
@@ -73,6 +73,6 @@ entry:
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "nomips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "nomips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/mips16_32_6.ll b/llvm/test/CodeGen/Mips/mips16_32_6.ll
index 5a464a2..0503b3f 100644
--- a/llvm/test/CodeGen/Mips/mips16_32_6.ll
+++ b/llvm/test/CodeGen/Mips/mips16_32_6.ll
@@ -80,6 +80,6 @@ entry:
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "nomips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "nomips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/mips16_32_7.ll b/llvm/test/CodeGen/Mips/mips16_32_7.ll
index 236f791..2b2dd8b 100644
--- a/llvm/test/CodeGen/Mips/mips16_32_7.ll
+++ b/llvm/test/CodeGen/Mips/mips16_32_7.ll
@@ -68,6 +68,6 @@ entry:
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/mips16_32_8.ll b/llvm/test/CodeGen/Mips/mips16_32_8.ll
index 5c0cd32..1aff91c 100644
--- a/llvm/test/CodeGen/Mips/mips16_32_8.ll
+++ b/llvm/test/CodeGen/Mips/mips16_32_8.ll
@@ -67,7 +67,7 @@ entry:
; 32: .set reorder
; 32: .end main
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "use-soft-float"="false" }
+attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/mips16_32_9.ll b/llvm/test/CodeGen/Mips/mips16_32_9.ll
index 609f054..82d7727 100644
--- a/llvm/test/CodeGen/Mips/mips16_32_9.ll
+++ b/llvm/test/CodeGen/Mips/mips16_32_9.ll
@@ -44,6 +44,6 @@ entry:
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/nomips16.ll b/llvm/test/CodeGen/Mips/nomips16.ll
index 62564f9..6b51eb9 100644
--- a/llvm/test/CodeGen/Mips/nomips16.ll
+++ b/llvm/test/CodeGen/Mips/nomips16.ll
@@ -33,6 +33,6 @@ entry:
; CHECK: .end nofoo
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/pbqp-reserved-physreg.ll b/llvm/test/CodeGen/Mips/pbqp-reserved-physreg.ll
index 63a730c..a8eab07 100644
--- a/llvm/test/CodeGen/Mips/pbqp-reserved-physreg.ll
+++ b/llvm/test/CodeGen/Mips/pbqp-reserved-physreg.ll
@@ -31,5 +31,5 @@ bb35: ; preds = %bb
unreachable
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/powif64_16.ll b/llvm/test/CodeGen/Mips/powif64_16.ll
index 3443b62..914ef94 100644
--- a/llvm/test/CodeGen/Mips/powif64_16.ll
+++ b/llvm/test/CodeGen/Mips/powif64_16.ll
@@ -17,7 +17,7 @@ define double @foo_pow_f64(double %y, i32 %p) {
ret double %1
}
-attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
attributes #1 = { nounwind readonly }
!0 = !{!"double", !1}
diff --git a/llvm/test/CodeGen/Mips/s2rem.ll b/llvm/test/CodeGen/Mips/s2rem.ll
index fdf06ce..5d324cb 100644
--- a/llvm/test/CodeGen/Mips/s2rem.ll
+++ b/llvm/test/CodeGen/Mips/s2rem.ll
@@ -86,7 +86,7 @@ entry:
declare void @vf(float) #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/sel1c.ll b/llvm/test/CodeGen/Mips/sel1c.ll
index 071f988..2aaf56d 100644
--- a/llvm/test/CodeGen/Mips/sel1c.ll
+++ b/llvm/test/CodeGen/Mips/sel1c.ll
@@ -16,6 +16,6 @@ entry:
; cond-b-short: bteqz $BB0_{{[0-9]+}} # 16 bit inst
}
-attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
diff --git a/llvm/test/CodeGen/Mips/sel2c.ll b/llvm/test/CodeGen/Mips/sel2c.ll
index 0c3b957..44de4ac9 100644
--- a/llvm/test/CodeGen/Mips/sel2c.ll
+++ b/llvm/test/CodeGen/Mips/sel2c.ll
@@ -16,6 +16,6 @@ entry:
ret void
}
-attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
diff --git a/llvm/test/CodeGen/Mips/simplebr.ll b/llvm/test/CodeGen/Mips/simplebr.ll
index cfe547f..ae09d85 100644
--- a/llvm/test/CodeGen/Mips/simplebr.ll
+++ b/llvm/test/CodeGen/Mips/simplebr.ll
@@ -31,7 +31,7 @@ declare void @goo(...) #1
declare void @hoo(...) #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
diff --git a/llvm/test/CodeGen/Mips/sr1.ll b/llvm/test/CodeGen/Mips/sr1.ll
index c6fa9fc..6c42d45 100644
--- a/llvm/test/CodeGen/Mips/sr1.ll
+++ b/llvm/test/CodeGen/Mips/sr1.ll
@@ -50,7 +50,7 @@ entry:
declare float @xf() #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/tnaked.ll b/llvm/test/CodeGen/Mips/tnaked.ll
index ac54f2f..287c009 100644
--- a/llvm/test/CodeGen/Mips/tnaked.ll
+++ b/llvm/test/CodeGen/Mips/tnaked.ll
@@ -25,5 +25,5 @@ entry:
; CHECK: .fmask 0x00000000,0
; CHECK: addiu $sp, $sp, -8
-attributes #0 = { naked noinline nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { naked noinline nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/RISCV/rv32p.ll b/llvm/test/CodeGen/RISCV/rv32p.ll
new file mode 100644
index 0000000..4eee880a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rv32p.ll
@@ -0,0 +1,709 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-p -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+declare i32 @llvm.ctlz.i32(i32, i1)
+
+define i32 @ctlz_i32(i32 %a) nounwind {
+; CHECK-LABEL: ctlz_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+ ret i32 %1
+}
+
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i64 @ctlz_i64(i64 %a) nounwind {
+; CHECK-LABEL: ctlz_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: bnez a1, .LBB1_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: addi a0, a0, 32
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB1_2:
+; CHECK-NEXT: clz a0, a1
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+ ret i64 %1
+}
+
+declare i32 @llvm.cttz.i32(i32, i1)
+
+define i32 @cttz_i32(i32 %a) nounwind {
+; CHECK-LABEL: cttz_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beqz a0, .LBB2_2
+; CHECK-NEXT: # %bb.1: # %cond.false
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB2_2:
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+ ret i32 %1
+}
+
+declare i64 @llvm.cttz.i64(i64, i1)
+
+define i64 @cttz_i64(i64 %a) nounwind {
+; CHECK-LABEL: cttz_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: or a2, a0, a1
+; CHECK-NEXT: beqz a2, .LBB3_3
+; CHECK-NEXT: # %bb.1: # %cond.false
+; CHECK-NEXT: bnez a0, .LBB3_4
+; CHECK-NEXT: # %bb.2: # %cond.false
+; CHECK-NEXT: addi a0, a1, -1
+; CHECK-NEXT: not a1, a1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: li a1, 64
+; CHECK-NEXT: j .LBB3_5
+; CHECK-NEXT: .LBB3_3:
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB3_4:
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: .LBB3_5: # %cond.false
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+ ret i64 %1
+}
+
+define i32 @sextb_i32(i32 %a) nounwind {
+; CHECK-LABEL: sextb_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.b a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i32 %a, 24
+ %shr = ashr exact i32 %shl, 24
+ ret i32 %shr
+}
+
+define i64 @sextb_i64(i64 %a) nounwind {
+; CHECK-LABEL: sextb_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.b a0, a0
+; CHECK-NEXT: srai a1, a0, 31
+; CHECK-NEXT: ret
+ %shl = shl i64 %a, 56
+ %shr = ashr exact i64 %shl, 56
+ ret i64 %shr
+}
+
+define i32 @sexth_i32(i32 %a) nounwind {
+; CHECK-LABEL: sexth_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.h a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i32 %a, 16
+ %shr = ashr exact i32 %shl, 16
+ ret i32 %shr
+}
+
+define i64 @sexth_i64(i64 %a) nounwind {
+; CHECK-LABEL: sexth_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.h a0, a0
+; CHECK-NEXT: srai a1, a0, 31
+; CHECK-NEXT: ret
+ %shl = shl i64 %a, 48
+ %shr = ashr exact i64 %shl, 48
+ ret i64 %shr
+}
+
+define i32 @min_i32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: min_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: min a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp slt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+; Because we do not match i64 code patterns directly on RV32, some i64
+; patterns do not yet have matching bit manipulation instructions on RV32.
+; This test is kept here in case future expansions of the Bitmanip
+; extensions introduce instructions suitable for this pattern.
+
+define i64 @min_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: min_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB9_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: slt a4, a1, a3
+; CHECK-NEXT: beqz a4, .LBB9_3
+; CHECK-NEXT: j .LBB9_4
+; CHECK-NEXT: .LBB9_2:
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: bnez a4, .LBB9_4
+; CHECK-NEXT: .LBB9_3:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: .LBB9_4:
+; CHECK-NEXT: ret
+ %cmp = icmp slt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define i32 @max_i32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: max_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: max a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+; Because we do not match i64 code patterns directly on RV32, some i64
+; patterns do not yet have matching bit manipulation instructions on RV32.
+; This test is kept here in case future expansions of the Bitmanip
+; extensions introduce instructions suitable for this pattern.
+
+define i64 @max_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: max_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: slt a4, a3, a1
+; CHECK-NEXT: beqz a4, .LBB11_3
+; CHECK-NEXT: j .LBB11_4
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: sltu a4, a2, a0
+; CHECK-NEXT: bnez a4, .LBB11_4
+; CHECK-NEXT: .LBB11_3:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: .LBB11_4:
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define i32 @minu_i32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: minu_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+; Because we do not match i64 code patterns directly on RV32, some i64
+; patterns do not yet have matching bit manipulation instructions on RV32.
+; This test is kept here in case future expansions of the Bitmanip
+; extensions introduce instructions suitable for this pattern.
+
+define i64 @minu_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: minu_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB13_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: beqz a4, .LBB13_3
+; CHECK-NEXT: j .LBB13_4
+; CHECK-NEXT: .LBB13_2:
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: bnez a4, .LBB13_4
+; CHECK-NEXT: .LBB13_3:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: .LBB13_4:
+; CHECK-NEXT: ret
+ %cmp = icmp ult i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define i32 @maxu_i32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: maxu_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: maxu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+; Because we do not match i64 code patterns directly on RV32, some i64
+; patterns do not yet have matching bit manipulation instructions on RV32.
+; This test is kept here in case future expansions of the Bitmanip
+; extensions introduce instructions suitable for this pattern.
+
+define i64 @maxu_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: maxu_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB15_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu a4, a3, a1
+; CHECK-NEXT: beqz a4, .LBB15_3
+; CHECK-NEXT: j .LBB15_4
+; CHECK-NEXT: .LBB15_2:
+; CHECK-NEXT: sltu a4, a2, a0
+; CHECK-NEXT: bnez a4, .LBB15_4
+; CHECK-NEXT: .LBB15_3:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: .LBB15_4:
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+declare i32 @llvm.abs.i32(i32, i1 immarg)
+
+define i32 @abs_i32(i32 %x) {
+; CHECK-LABEL: abs_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: abs a0, a0
+; CHECK-NEXT: ret
+ %abs = tail call i32 @llvm.abs.i32(i32 %x, i1 true)
+ ret i32 %abs
+}
+
+declare i64 @llvm.abs.i64(i64, i1 immarg)
+
+define i64 @abs_i64(i64 %x) {
+; CHECK-LABEL: abs_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: bgez a1, .LBB17_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: snez a2, a0
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: sub a1, a1, a2
+; CHECK-NEXT: .LBB17_2:
+; CHECK-NEXT: ret
+ %abs = tail call i64 @llvm.abs.i64(i64 %x, i1 true)
+ ret i64 %abs
+}
+
+define i32 @zexth_i32(i32 %a) nounwind {
+; CHECK-LABEL: zexth_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srli a0, a0, 16
+; CHECK-NEXT: ret
+ %and = and i32 %a, 65535
+ ret i32 %and
+}
+
+define i64 @zexth_i64(i64 %a) nounwind {
+; CHECK-LABEL: zexth_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srli a0, a0, 16
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: ret
+ %and = and i64 %a, 65535
+ ret i64 %and
+}
+
+declare i32 @llvm.bswap.i32(i32)
+
+define i32 @bswap_i32(i32 %a) nounwind {
+; CHECK-LABEL: bswap_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a0, a0
+; CHECK-NEXT: ret
+ %1 = tail call i32 @llvm.bswap.i32(i32 %a)
+ ret i32 %1
+}
+
+declare i64 @llvm.bswap.i64(i64)
+
+define i64 @bswap_i64(i64 %a) {
+; CHECK-LABEL: bswap_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a2, a1
+; CHECK-NEXT: rev8 a1, a0
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.bswap.i64(i64 %a)
+ ret i64 %1
+}
+
+define i32 @srai_slli(i16 signext %0) {
+; CHECK-LABEL: srai_slli:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 25
+; CHECK-NEXT: srai a0, a0, 31
+; CHECK-NEXT: ret
+ %2 = shl i16 %0, 9
+ %sext = ashr i16 %2, 15
+ %3 = sext i16 %sext to i32
+ ret i32 %3
+}
+
+define i32 @srai_slli2(i16 signext %0) {
+; CHECK-LABEL: srai_slli2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 25
+; CHECK-NEXT: srai a0, a0, 30
+; CHECK-NEXT: ret
+ %2 = shl i16 %0, 9
+ %sext = ashr i16 %2, 14
+ %3 = sext i16 %sext to i32
+ ret i32 %3
+}
+define i8 @sub_if_uge_i8(i8 %x, i8 %y) {
+; CHECK-LABEL: sub_if_uge_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: zext.b a2, a0
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: zext.b a0, a0
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ult i8 %x, %y
+ %select = select i1 %cmp, i8 0, i8 %y
+ %sub = sub nuw i8 %x, %select
+ ret i8 %sub
+}
+
+define i16 @sub_if_uge_i16(i16 %x, i16 %y) {
+; CHECK-LABEL: sub_if_uge_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 16
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i16 %x, %y
+ %select = select i1 %cmp, i16 0, i16 %y
+ %sub = sub nuw i16 %x, %select
+ ret i16 %sub
+}
+
+define i32 @sub_if_uge_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ ret i32 %sub
+}
+
+define i64 @sub_if_uge_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: sub_if_uge_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB27_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: j .LBB27_3
+; CHECK-NEXT: .LBB27_2:
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: .LBB27_3:
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: and a2, a4, a2
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: sub a1, a1, a3
+; CHECK-NEXT: sub a1, a1, a4
+; CHECK-NEXT: sub a0, a0, a2
+; CHECK-NEXT: ret
+ %cmp = icmp ult i64 %x, %y
+ %select = select i1 %cmp, i64 0, i64 %y
+ %sub = sub nuw i64 %x, %select
+ ret i64 %sub
+}
+
+define i128 @sub_if_uge_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: sub_if_uge_i128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a3, 4(a1)
+; CHECK-NEXT: lw a4, 8(a1)
+; CHECK-NEXT: lw a5, 12(a1)
+; CHECK-NEXT: lw a6, 4(a2)
+; CHECK-NEXT: lw t0, 12(a2)
+; CHECK-NEXT: lw a7, 8(a2)
+; CHECK-NEXT: beq a5, t0, .LBB28_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu t1, a5, t0
+; CHECK-NEXT: j .LBB28_3
+; CHECK-NEXT: .LBB28_2:
+; CHECK-NEXT: sltu t1, a4, a7
+; CHECK-NEXT: .LBB28_3:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: beq a3, a6, .LBB28_5
+; CHECK-NEXT: # %bb.4:
+; CHECK-NEXT: sltu t2, a3, a6
+; CHECK-NEXT: j .LBB28_6
+; CHECK-NEXT: .LBB28_5:
+; CHECK-NEXT: sltu t2, a1, a2
+; CHECK-NEXT: .LBB28_6:
+; CHECK-NEXT: xor t3, a5, t0
+; CHECK-NEXT: xor t4, a4, a7
+; CHECK-NEXT: or t3, t4, t3
+; CHECK-NEXT: beqz t3, .LBB28_8
+; CHECK-NEXT: # %bb.7:
+; CHECK-NEXT: mv t2, t1
+; CHECK-NEXT: .LBB28_8:
+; CHECK-NEXT: addi t3, t2, -1
+; CHECK-NEXT: and t2, t3, t0
+; CHECK-NEXT: and t0, t3, a2
+; CHECK-NEXT: and t1, t3, a6
+; CHECK-NEXT: sltu a2, a1, t0
+; CHECK-NEXT: and a7, t3, a7
+; CHECK-NEXT: mv a6, a2
+; CHECK-NEXT: beq a3, t1, .LBB28_10
+; CHECK-NEXT: # %bb.9:
+; CHECK-NEXT: sltu a6, a3, t1
+; CHECK-NEXT: .LBB28_10:
+; CHECK-NEXT: sub t3, a4, a7
+; CHECK-NEXT: sltu a4, a4, a7
+; CHECK-NEXT: sub a5, a5, t2
+; CHECK-NEXT: sub a3, a3, t1
+; CHECK-NEXT: sub a1, a1, t0
+; CHECK-NEXT: sltu a7, t3, a6
+; CHECK-NEXT: sub a5, a5, a4
+; CHECK-NEXT: sub a4, t3, a6
+; CHECK-NEXT: sub a3, a3, a2
+; CHECK-NEXT: sub a2, a5, a7
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: sw a3, 4(a0)
+; CHECK-NEXT: sw a4, 8(a0)
+; CHECK-NEXT: sw a2, 12(a0)
+; CHECK-NEXT: ret
+ %cmp = icmp ult i128 %x, %y
+ %select = select i1 %cmp, i128 0, i128 %y
+ %sub = sub nuw i128 %x, %select
+ ret i128 %sub
+}
+
+define i32 @sub_if_uge_multiuse_select_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_multiuse_select_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sltu a2, a0, a1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: sll a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ %shl = shl i32 %sub, %select
+ ret i32 %shl
+}
+
+define i32 @sub_if_uge_multiuse_cmp_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_multiuse_cmp_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sub a2, a0, a1
+; CHECK-NEXT: minu a2, a0, a2
+; CHECK-NEXT: bltu a0, a1, .LBB30_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 4
+; CHECK-NEXT: sll a0, a2, a0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB30_2:
+; CHECK-NEXT: li a0, 2
+; CHECK-NEXT: sll a0, a2, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ %select2 = select i1 %cmp, i32 2, i32 4
+ %shl = shl i32 %sub, %select2
+ ret i32 %shl
+}
+
+define i32 @sub_if_uge_multiuse_cmp_store_i32(i32 %x, i32 %y, ptr %z) {
+; CHECK-LABEL: sub_if_uge_multiuse_cmp_store_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sltu a3, a0, a1
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: xori a3, a3, 1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: sw a3, 0(a2)
+; CHECK-NEXT: ret
+ %cmp = icmp uge i32 %x, %y
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, ptr %z, align 4
+ %select = select i1 %cmp, i32 %y, i32 0
+ %sub = sub nuw i32 %x, %select
+ ret i32 %sub
+}
+
+define i8 @sub_if_uge_C_i8(i8 zeroext %x) {
+; CHECK-LABEL: sub_if_uge_C_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -13
+; CHECK-NEXT: zext.b a1, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i8 %x, 12
+ %sub = add i8 %x, -13
+ %conv4 = select i1 %cmp, i8 %sub, i8 %x
+ ret i8 %conv4
+}
+
+define i16 @sub_if_uge_C_i16(i16 zeroext %x) {
+; CHECK-LABEL: sub_if_uge_C_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -251
+; CHECK-NEXT: slli a1, a1, 16
+; CHECK-NEXT: srli a1, a1, 16
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i16 %x, 250
+ %sub = add i16 %x, -251
+ %conv4 = select i1 %cmp, i16 %sub, i16 %x
+ ret i16 %conv4
+}
+
+define i32 @sub_if_uge_C_i32(i32 signext %x) {
+; CHECK-LABEL: sub_if_uge_C_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1048560
+; CHECK-NEXT: addi a1, a1, 15
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %x, 65520
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i64 @sub_if_uge_C_i64(i64 %x) {
+; CHECK-LABEL: sub_if_uge_C_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 1
+; CHECK-NEXT: beq a1, a2, .LBB35_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltiu a2, a1, 2
+; CHECK-NEXT: xori a2, a2, 1
+; CHECK-NEXT: j .LBB35_3
+; CHECK-NEXT: .LBB35_2:
+; CHECK-NEXT: lui a2, 172127
+; CHECK-NEXT: addi a2, a2, 511
+; CHECK-NEXT: sltu a2, a2, a0
+; CHECK-NEXT: .LBB35_3:
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: andi a3, a2, -2
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: lui a3, 876449
+; CHECK-NEXT: addi a3, a3, -512
+; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: sltu a0, a2, a0
+; CHECK-NEXT: add a1, a1, a0
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i64 %x, 4999999999
+ %sub = add i64 %x, -5000000000
+ %cond = select i1 %cmp, i64 %sub, i64 %x
+ ret i64 %cond
+}
+
+define i32 @sub_if_uge_C_multiuse_cmp_i32(i32 signext %x, ptr %z) {
+; CHECK-LABEL: sub_if_uge_C_multiuse_cmp_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 16
+; CHECK-NEXT: lui a3, 1048560
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: addi a3, a3, 15
+; CHECK-NEXT: sltu a2, a2, a0
+; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: minu a0, a3, a0
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %x, 65520
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, ptr %z, align 4
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i32 @sub_if_uge_C_multiuse_sub_i32(i32 signext %x, ptr %z) {
+; CHECK-LABEL: sub_if_uge_C_multiuse_sub_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 1048560
+; CHECK-NEXT: addi a2, a2, 15
+; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %sub = add i32 %x, -65521
+ store i32 %sub, ptr %z, align 4
+ %cmp = icmp ugt i32 %x, 65520
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i32 @sub_if_uge_C_swapped_i32(i32 %x) {
+; CHECK-LABEL: sub_if_uge_C_swapped_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1048560
+; CHECK-NEXT: addi a1, a1, 15
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, 65521
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ ret i32 %cond
+}
+
+define i7 @sub_if_uge_C_nsw_i7(i7 %a) {
+; CHECK-LABEL: sub_if_uge_C_nsw_i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori a0, a0, 51
+; CHECK-NEXT: andi a1, a0, 127
+; CHECK-NEXT: addi a0, a0, 17
+; CHECK-NEXT: andi a0, a0, 92
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %x = or i7 %a, 51
+ %c = icmp ugt i7 %x, -18
+ %add = add nsw i7 %x, 17
+ %s = select i1 %c, i7 %add, i7 %x
+ ret i7 %s
+}
+
+define i7 @sub_if_uge_C_swapped_nsw_i7(i7 %a) {
+; CHECK-LABEL: sub_if_uge_C_swapped_nsw_i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori a0, a0, 51
+; CHECK-NEXT: andi a1, a0, 127
+; CHECK-NEXT: addi a0, a0, 17
+; CHECK-NEXT: andi a0, a0, 92
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %x = or i7 %a, 51
+ %c = icmp ult i7 %x, -17
+ %add = add nsw i7 %x, 17
+ %s = select i1 %c, i7 %x, i7 %add
+ ret i7 %s
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64p.ll b/llvm/test/CodeGen/RISCV/rv64p.ll
new file mode 100644
index 0000000..cb07f94
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rv64p.ll
@@ -0,0 +1,677 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+declare i32 @llvm.ctlz.i32(i32, i1)
+
+define signext i32 @ctlz_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: ctlz_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+ ret i32 %1
+}
+
+define signext i32 @log2_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: log2_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: li a1, 31
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+ %2 = sub i32 31, %1
+ ret i32 %2
+}
+
+define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: log2_ceil_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+ %1 = sub i32 %a, 1
+ %2 = call i32 @llvm.ctlz.i32(i32 %1, i1 false)
+ %3 = sub i32 32, %2
+ ret i32 %3
+}
+
+define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: findLastSet_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clzw a1, a0
+; CHECK-NEXT: snez a0, a0
+; CHECK-NEXT: xori a1, a1, 31
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: or a0, a0, a1
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
+ %2 = xor i32 31, %1
+ %3 = icmp eq i32 %a, 0
+ %4 = select i1 %3, i32 -1, i32 %2
+ ret i32 %4
+}
+
+define i32 @ctlz_lshr_i32(i32 signext %a) {
+; CHECK-LABEL: ctlz_lshr_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: srliw a0, a0, 1
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: ret
+ %1 = lshr i32 %a, 1
+ %2 = call i32 @llvm.ctlz.i32(i32 %1, i1 false)
+ ret i32 %2
+}
+
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i64 @ctlz_i64(i64 %a) nounwind {
+; CHECK-LABEL: ctlz_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+ ret i64 %1
+}
+
+declare i32 @llvm.cttz.i32(i32, i1)
+
+define signext i32 @cttz_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: cttz_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beqz a0, .LBB6_2
+; CHECK-NEXT: # %bb.1: # %cond.false
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+ ret i32 %1
+}
+
+define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: cttz_zero_undef_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 true)
+ ret i32 %1
+}
+
+define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: findFirstSet_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a2, a0
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: snez a0, a0
+; CHECK-NEXT: clzw a1, a1
+; CHECK-NEXT: sub a2, a2, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: or a0, a0, a2
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 true)
+ %2 = icmp eq i32 %a, 0
+ %3 = select i1 %2, i32 -1, i32 %1
+ ret i32 %3
+}
+
+define signext i32 @ffs_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: ffs_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a2, a0
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: li a2, 33
+; CHECK-NEXT: seqz a0, a0
+; CHECK-NEXT: clzw a1, a1
+; CHECK-NEXT: sub a2, a2, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 true)
+ %2 = add i32 %1, 1
+ %3 = icmp eq i32 %a, 0
+ %4 = select i1 %3, i32 0, i32 %2
+ ret i32 %4
+}
+
+declare i64 @llvm.cttz.i64(i64, i1)
+
+define i64 @cttz_i64(i64 %a) nounwind {
+; CHECK-LABEL: cttz_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beqz a0, .LBB10_2
+; CHECK-NEXT: # %bb.1: # %cond.false
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: li a1, 64
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+ ret i64 %1
+}
+
+define signext i32 @sextb_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: sextb_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.b a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i32 %a, 24
+ %shr = ashr exact i32 %shl, 24
+ ret i32 %shr
+}
+
+define i64 @sextb_i64(i64 %a) nounwind {
+; CHECK-LABEL: sextb_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.b a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i64 %a, 56
+ %shr = ashr exact i64 %shl, 56
+ ret i64 %shr
+}
+
+define signext i32 @sexth_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: sexth_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.h a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i32 %a, 16
+ %shr = ashr exact i32 %shl, 16
+ ret i32 %shr
+}
+
+define i64 @sexth_i64(i64 %a) nounwind {
+; CHECK-LABEL: sexth_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.h a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i64 %a, 48
+ %shr = ashr exact i64 %shl, 48
+ ret i64 %shr
+}
+
+define signext i32 @min_i32(i32 signext %a, i32 signext %b) nounwind {
+; CHECK-LABEL: min_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: min a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp slt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+define i64 @min_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: min_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: min a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp slt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define signext i32 @max_i32(i32 signext %a, i32 signext %b) nounwind {
+; CHECK-LABEL: max_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: max a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+define i64 @max_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: max_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: max a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define signext i32 @minu_i32(i32 signext %a, i32 signext %b) nounwind {
+; CHECK-LABEL: minu_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+define i64 @minu_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: minu_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define signext i32 @maxu_i32(i32 signext %a, i32 signext %b) nounwind {
+; CHECK-LABEL: maxu_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: maxu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+define i64 @maxu_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: maxu_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: maxu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+declare i32 @llvm.abs.i32(i32, i1 immarg)
+
+define i32 @abs_i32(i32 %x) {
+; CHECK-LABEL: abs_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.w a0, a0
+; CHECK-NEXT: abs a0, a0
+; CHECK-NEXT: ret
+ %abs = tail call i32 @llvm.abs.i32(i32 %x, i1 true)
+ ret i32 %abs
+}
+
+define signext i32 @abs_i32_sext(i32 signext %x) {
+; CHECK-LABEL: abs_i32_sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: abs a0, a0
+; CHECK-NEXT: sext.w a0, a0
+; CHECK-NEXT: ret
+ %abs = tail call i32 @llvm.abs.i32(i32 %x, i1 true)
+ ret i32 %abs
+}
+
+declare i64 @llvm.abs.i64(i64, i1 immarg)
+
+define i64 @abs_i64(i64 %x) {
+; CHECK-LABEL: abs_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: abs a0, a0
+; CHECK-NEXT: ret
+ %abs = tail call i64 @llvm.abs.i64(i64 %x, i1 true)
+ ret i64 %abs
+}
+
+declare i32 @llvm.bswap.i32(i32)
+
+define signext i32 @bswap_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: bswap_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a0, a0
+; CHECK-NEXT: srai a0, a0, 32
+; CHECK-NEXT: ret
+ %1 = tail call i32 @llvm.bswap.i32(i32 %a)
+ ret i32 %1
+}
+
+; Similar to bswap_i32, but the result is not sign-extended.
+define void @bswap_i32_nosext(i32 signext %a, ptr %x) nounwind {
+; CHECK-LABEL: bswap_i32_nosext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a0, a0
+; CHECK-NEXT: srli a0, a0, 32
+; CHECK-NEXT: sw a0, 0(a1)
+; CHECK-NEXT: ret
+ %1 = tail call i32 @llvm.bswap.i32(i32 %a)
+ store i32 %1, ptr %x
+ ret void
+}
+
+declare i64 @llvm.bswap.i64(i64)
+
+define i64 @bswap_i64(i64 %a) {
+; CHECK-LABEL: bswap_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a0, a0
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.bswap.i64(i64 %a)
+ ret i64 %1
+}
+
+define i64 @srai_slli(i16 signext %0) {
+; CHECK-LABEL: srai_slli:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 57
+; CHECK-NEXT: srai a0, a0, 63
+; CHECK-NEXT: ret
+ %2 = shl i16 %0, 9
+ %sext = ashr i16 %2, 15
+ %3 = sext i16 %sext to i64
+ ret i64 %3
+}
+
+define i64 @srai_slli2(i16 signext %0) {
+; CHECK-LABEL: srai_slli2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 57
+; CHECK-NEXT: srai a0, a0, 62
+; CHECK-NEXT: ret
+ %2 = shl i16 %0, 9
+ %sext = ashr i16 %2, 14
+ %3 = sext i16 %sext to i64
+ ret i64 %3
+}
+
+define signext i32 @func0000000000000001(i32 signext %0, i8 signext %1) #0 {
+; CHECK-LABEL: func0000000000000001:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: slli a1, a1, 59
+; CHECK-NEXT: srai a1, a1, 63
+; CHECK-NEXT: addw a0, a1, a0
+; CHECK-NEXT: ret
+entry:
+ %2 = shl i8 %1, 3
+ %3 = ashr i8 %2, 7
+ %4 = sext i8 %3 to i32
+ %5 = add nsw i32 %4, %0
+ ret i32 %5
+}
+
+define i8 @sub_if_uge_i8(i8 %x, i8 %y) {
+; CHECK-LABEL: sub_if_uge_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: zext.b a2, a0
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: zext.b a0, a0
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ult i8 %x, %y
+ %select = select i1 %cmp, i8 0, i8 %y
+ %sub = sub nuw i8 %x, %select
+ ret i8 %sub
+}
+
+define i16 @sub_if_uge_i16(i16 %x, i16 %y) {
+; CHECK-LABEL: sub_if_uge_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 16
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i16 %x, %y
+ %select = select i1 %cmp, i16 0, i16 %y
+ %sub = sub nuw i16 %x, %select
+ ret i16 %sub
+}
+
+define i32 @sub_if_uge_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.w a2, a0
+; CHECK-NEXT: subw a0, a0, a1
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ ret i32 %sub
+}
+
+define i64 @sub_if_uge_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: sub_if_uge_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i64 %x, %y
+ %select = select i1 %cmp, i64 0, i64 %y
+ %sub = sub nuw i64 %x, %select
+ ret i64 %sub
+}
+
+define i128 @sub_if_uge_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: sub_if_uge_i128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB36_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: j .LBB36_3
+; CHECK-NEXT: .LBB36_2:
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: .LBB36_3:
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: and a2, a4, a2
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: sub a1, a1, a3
+; CHECK-NEXT: sub a1, a1, a4
+; CHECK-NEXT: sub a0, a0, a2
+; CHECK-NEXT: ret
+ %cmp = icmp ult i128 %x, %y
+ %select = select i1 %cmp, i128 0, i128 %y
+ %sub = sub nuw i128 %x, %select
+ ret i128 %sub
+}
+
+define i32 @sub_if_uge_multiuse_select_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_multiuse_select_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.w a2, a1
+; CHECK-NEXT: sext.w a3, a0
+; CHECK-NEXT: sltu a2, a3, a2
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: sllw a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ %shl = shl i32 %sub, %select
+ ret i32 %shl
+}
+
+define i32 @sub_if_uge_multiuse_cmp_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_multiuse_cmp_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.w a2, a1
+; CHECK-NEXT: sext.w a3, a0
+; CHECK-NEXT: subw a0, a0, a1
+; CHECK-NEXT: minu a0, a3, a0
+; CHECK-NEXT: bltu a3, a2, .LBB38_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: sllw a0, a0, a1
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB38_2:
+; CHECK-NEXT: li a1, 2
+; CHECK-NEXT: sllw a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ %select2 = select i1 %cmp, i32 2, i32 4
+ %shl = shl i32 %sub, %select2
+ ret i32 %shl
+}
+
+define i32 @sub_if_uge_multiuse_cmp_store_i32(i32 signext %x, i32 signext %y, ptr %z) {
+; CHECK-LABEL: sub_if_uge_multiuse_cmp_store_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sltu a3, a0, a1
+; CHECK-NEXT: subw a1, a0, a1
+; CHECK-NEXT: xori a3, a3, 1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: sw a3, 0(a2)
+; CHECK-NEXT: ret
+ %cmp = icmp uge i32 %x, %y
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, ptr %z, align 4
+ %select = select i1 %cmp, i32 %y, i32 0
+ %sub = sub nuw i32 %x, %select
+ ret i32 %sub
+}
+
+define i8 @sub_if_uge_C_i8(i8 zeroext %x) {
+; CHECK-LABEL: sub_if_uge_C_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -13
+; CHECK-NEXT: zext.b a1, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i8 %x, 12
+ %sub = add i8 %x, -13
+ %conv4 = select i1 %cmp, i8 %sub, i8 %x
+ ret i8 %conv4
+}
+
+define i16 @sub_if_uge_C_i16(i16 zeroext %x) {
+; CHECK-LABEL: sub_if_uge_C_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -251
+; CHECK-NEXT: slli a1, a1, 48
+; CHECK-NEXT: srli a1, a1, 48
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i16 %x, 250
+ %sub = add i16 %x, -251
+ %conv4 = select i1 %cmp, i16 %sub, i16 %x
+ ret i16 %conv4
+}
+
+define i32 @sub_if_uge_C_i32(i32 signext %x) {
+; CHECK-LABEL: sub_if_uge_C_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1048560
+; CHECK-NEXT: addi a1, a1, 15
+; CHECK-NEXT: addw a1, a0, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %x, 65520
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i64 @sub_if_uge_C_i64(i64 %x) {
+; CHECK-LABEL: sub_if_uge_C_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1046192
+; CHECK-NEXT: addi a1, a1, -761
+; CHECK-NEXT: slli a1, a1, 9
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i64 %x, 4999999999
+ %sub = add i64 %x, -5000000000
+ %cond = select i1 %cmp, i64 %sub, i64 %x
+ ret i64 %cond
+}
+
+define i32 @sub_if_uge_C_multiuse_cmp_i32(i32 signext %x, ptr %z) {
+; CHECK-LABEL: sub_if_uge_C_multiuse_cmp_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 16
+; CHECK-NEXT: lui a3, 1048560
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: addi a3, a3, 15
+; CHECK-NEXT: sltu a2, a2, a0
+; CHECK-NEXT: addw a3, a0, a3
+; CHECK-NEXT: minu a0, a3, a0
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %x, 65520
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, ptr %z, align 4
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i32 @sub_if_uge_C_multiuse_sub_i32(i32 signext %x, ptr %z) {
+; CHECK-LABEL: sub_if_uge_C_multiuse_sub_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 1048560
+; CHECK-NEXT: addi a2, a2, 15
+; CHECK-NEXT: addw a2, a0, a2
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %sub = add i32 %x, -65521
+ store i32 %sub, ptr %z, align 4
+ %cmp = icmp ugt i32 %x, 65520
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i32 @sub_if_uge_C_swapped_i32(i32 signext %x) {
+; CHECK-LABEL: sub_if_uge_C_swapped_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1048560
+; CHECK-NEXT: addi a1, a1, 15
+; CHECK-NEXT: addw a1, a0, a1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, 65521
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ ret i32 %cond
+}
+
+define i7 @sub_if_uge_C_nsw_i7(i7 %a) {
+; CHECK-LABEL: sub_if_uge_C_nsw_i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori a0, a0, 51
+; CHECK-NEXT: andi a1, a0, 127
+; CHECK-NEXT: addi a0, a0, 17
+; CHECK-NEXT: andi a0, a0, 92
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %x = or i7 %a, 51
+ %c = icmp ugt i7 %x, -18
+ %add = add nsw i7 %x, 17
+ %s = select i1 %c, i7 %add, i7 %x
+ ret i7 %s
+}
+
+define i7 @sub_if_uge_C_swapped_nsw_i7(i7 %a) {
+; CHECK-LABEL: sub_if_uge_C_swapped_nsw_i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori a0, a0, 51
+; CHECK-NEXT: andi a1, a0, 127
+; CHECK-NEXT: addi a0, a0, 17
+; CHECK-NEXT: andi a0, a0, 92
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %x = or i7 %a, 51
+ %c = icmp ult i7 %x, -17
+ %add = add nsw i7 %x, 17
+ %s = select i1 %c, i7 %x, i7 %add
+ ret i7 %s
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfbfexp16e.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfbfexp16e.ll
new file mode 100644
index 0000000..5c0c6c1
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfbfexp16e.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfbfmin,+xsfvfbfexp16e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfbfmin,+xsfvfbfexp16e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 1 x bfloat> @intrinsic_sf_vfexp_v_nxv1bf16(<vscale x 1 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 2 x bfloat> @intrinsic_sf_vfexp_v_nxv2bf16(<vscale x 2 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.nxv2bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+define <vscale x 4 x bfloat> @intrinsic_sf_vfexp_v_nxv4bf16(<vscale x 4 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.nxv4bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+define <vscale x 8 x bfloat> @intrinsic_sf_vfexp_v_nxv8bf16(<vscale x 8 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.nxv8bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+define <vscale x 16 x bfloat> @intrinsic_sf_vfexp_v_nxv16bf16(<vscale x 16 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.nxv16bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 32 x bfloat> @intrinsic_sf_vfexp_v_nxv32bf16(<vscale x 32 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 2 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+define <vscale x 4 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+define <vscale x 8 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+define <vscale x 16 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 32 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 32 x bfloat> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfexp16e.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfexp16e.ll
new file mode 100644
index 0000000..2d97f73
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfexp16e.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+xsfvfexp16e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+xsfvfexp16e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 1 x half> @intrinsic_sf_vfexp_v_nxv1f16(<vscale x 1 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv1f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.nxv1f16(
+ <vscale x 1 x half> poison,
+ <vscale x 1 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x half> %a
+}
+
+define <vscale x 2 x half> @intrinsic_sf_vfexp_v_nxv2f16(<vscale x 2 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv2f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16(
+ <vscale x 2 x half> poison,
+ <vscale x 2 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x half> %a
+}
+
+define <vscale x 4 x half> @intrinsic_sf_vfexp_v_nxv4f16(<vscale x 4 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv4f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16(
+ <vscale x 4 x half> poison,
+ <vscale x 4 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x half> %a
+}
+
+define <vscale x 8 x half> @intrinsic_sf_vfexp_v_nxv8f16(<vscale x 8 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv8f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16(
+ <vscale x 8 x half> poison,
+ <vscale x 8 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x half> %a
+}
+
+define <vscale x 16 x half> @intrinsic_sf_vfexp_v_nxv16f16(<vscale x 16 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv16f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16(
+ <vscale x 16 x half> poison,
+ <vscale x 16 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x half> %a
+}
+
+define <vscale x 32 x half> @intrinsic_sf_vfexp_v_nxv32f16(<vscale x 32 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv32f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16(
+ <vscale x 32 x half> poison,
+ <vscale x 32 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 32 x half> %a
+}
+
+define <vscale x 1 x half> @intrinsic_sf_vfexp_mask_v_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv1f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16(
+ <vscale x 1 x half> %0,
+ <vscale x 1 x half> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 1 x half> %a
+}
+
+define <vscale x 2 x half> @intrinsic_sf_vfexp_mask_v_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv2f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16(
+ <vscale x 2 x half> %0,
+ <vscale x 2 x half> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 2 x half> %a
+}
+
+define <vscale x 4 x half> @intrinsic_sf_vfexp_mask_v_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv4f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16(
+ <vscale x 4 x half> %0,
+ <vscale x 4 x half> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 4 x half> %a
+}
+
+define <vscale x 8 x half> @intrinsic_sf_vfexp_mask_v_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv8f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16(
+ <vscale x 8 x half> %0,
+ <vscale x 8 x half> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 8 x half> %a
+}
+
+define <vscale x 16 x half> @intrinsic_sf_vfexp_mask_v_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv16f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16(
+ <vscale x 16 x half> %0,
+ <vscale x 16 x half> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 16 x half> %a
+}
+
+define <vscale x 32 x half> @intrinsic_sf_vfexp_mask_v_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv32f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16(
+ <vscale x 32 x half> %0,
+ <vscale x 32 x half> %1,
+ <vscale x 32 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 32 x half> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfexp32e.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfexp32e.ll
new file mode 100644
index 0000000..46dce14
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfexp32e.ll
@@ -0,0 +1,160 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+xsfvfexp32e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+xsfvfexp32e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 1 x float> @intrinsic_sf_vfexp_v_nxv1f32(<vscale x 1 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x float> %a
+}
+
+define <vscale x 2 x float> @intrinsic_sf_vfexp_v_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x float> %a
+}
+
+define <vscale x 4 x float> @intrinsic_sf_vfexp_v_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x float> %a
+}
+
+define <vscale x 8 x float> @intrinsic_sf_vfexp_v_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x float> %a
+}
+
+define <vscale x 16 x float> @intrinsic_sf_vfexp_v_nxv16f32(<vscale x 16 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv16f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32(
+ <vscale x 16 x float> poison,
+ <vscale x 16 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x float> %a
+}
+
+define <vscale x 1 x float> @intrinsic_sf_vfexp_mask_v_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x float> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 1 x float> %a
+}
+
+define <vscale x 2 x float> @intrinsic_sf_vfexp_mask_v_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x float> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 2 x float> %a
+}
+
+define <vscale x 4 x float> @intrinsic_sf_vfexp_mask_v_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x float> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 4 x float> %a
+}
+
+define <vscale x 8 x float> @intrinsic_sf_vfexp_mask_v_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x float> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 8 x float> %a
+}
+
+define <vscale x 16 x float> @intrinsic_sf_vfexp_mask_v_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv16f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x float> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 16 x float> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa.ll
new file mode 100644
index 0000000..d3d10d2
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa.ll
@@ -0,0 +1,335 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+zve64f,+zvfh,+xsfvfexpa \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zve64f,+zvfh,+xsfvfexpa \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 1 x float> @test_intrinsic_sf_vfexpa_v_nxv1f32(<vscale x 1 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.nxv1f32(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x float> %0,
+ iXLen %1)
+ ret <vscale x 1 x float> %f
+}
+
+define <vscale x 2 x float> @test_intrinsic_sf_vfexpa_v_nxv2f32(<vscale x 2 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x float> %0,
+ iXLen %1)
+ ret <vscale x 2 x float> %f
+}
+
+define <vscale x 4 x float> @test_intrinsic_sf_vfexpa_v_nxv4f32(<vscale x 4 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x float> %0,
+ iXLen %1)
+ ret <vscale x 4 x float> %f
+}
+
+define <vscale x 8 x float> @test_intrinsic_sf_vfexpa_v_nxv8f32(<vscale x 8 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x float> %0,
+ iXLen %1)
+ ret <vscale x 8 x float> %f
+}
+
+define <vscale x 16 x float> @test_intrinsic_sf_vfexpa_v_nxv16f32(<vscale x 16 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv16f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32(
+ <vscale x 16 x float> poison,
+ <vscale x 16 x float> %0,
+ iXLen %1)
+ ret <vscale x 16 x float> %f
+}
+
+define <vscale x 1 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x float> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 1 x float> %f
+}
+
+define <vscale x 2 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x float> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 2 x float> %f
+}
+
+define <vscale x 4 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x float> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 4 x float> %f
+}
+
+define <vscale x 8 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x float> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 8 x float> %f
+}
+
+define <vscale x 16 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv16f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x float> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 16 x float> %f
+}
+
+define <vscale x 1 x half> @test_intrinsic_sf_vfexpa_v_nxv1f16(<vscale x 1 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv1f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16(
+ <vscale x 1 x half> poison,
+ <vscale x 1 x half> %0,
+ iXLen %1)
+ ret <vscale x 1 x half> %f
+}
+
+define <vscale x 2 x half> @test_intrinsic_sf_vfexpa_v_nxv2f16(<vscale x 2 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv2f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16(
+ <vscale x 2 x half> poison,
+ <vscale x 2 x half> %0,
+ iXLen %1)
+ ret <vscale x 2 x half> %f
+}
+
+define <vscale x 4 x half> @test_intrinsic_sf_vfexpa_v_nxv4f16(<vscale x 4 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv4f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16(
+ <vscale x 4 x half> poison,
+ <vscale x 4 x half> %0,
+ iXLen %1)
+ ret <vscale x 4 x half> %f
+}
+
+define <vscale x 8 x half> @test_intrinsic_sf_vfexpa_v_nxv8f16(<vscale x 8 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv8f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16(
+ <vscale x 8 x half> poison,
+ <vscale x 8 x half> %0,
+ iXLen %1)
+ ret <vscale x 8 x half> %f
+}
+
+define <vscale x 16 x half> @test_intrinsic_sf_vfexpa_v_nxv16f16(<vscale x 16 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv16f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16(
+ <vscale x 16 x half> poison,
+ <vscale x 16 x half> %0,
+ iXLen %1)
+ ret <vscale x 16 x half> %f
+}
+
+define <vscale x 32 x half> @test_intrinsic_sf_vfexpa_v_nxv32f16(<vscale x 32 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv32f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16(
+ <vscale x 32 x half> poison,
+ <vscale x 32 x half> %0,
+ iXLen %1)
+ ret <vscale x 32 x half> %f
+}
+
+define <vscale x 1 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv1f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16(
+ <vscale x 1 x half> %0,
+ <vscale x 1 x half> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 1 x half> %f
+}
+
+define <vscale x 2 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv2f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16(
+ <vscale x 2 x half> %0,
+ <vscale x 2 x half> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 2 x half> %f
+}
+
+define <vscale x 4 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv4f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16(
+ <vscale x 4 x half> %0,
+ <vscale x 4 x half> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 4 x half> %f
+}
+
+define <vscale x 8 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv8f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16(
+ <vscale x 8 x half> %0,
+ <vscale x 8 x half> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 8 x half> %f
+}
+
+define <vscale x 16 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv16f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16(
+ <vscale x 16 x half> %0,
+ <vscale x 16 x half> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 16 x half> %f
+}
+
+define <vscale x 32 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv32f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16(
+ <vscale x 32 x half> %0,
+ <vscale x 32 x half> %1,
+ <vscale x 32 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 32 x half> %f
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa64e.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa64e.ll
new file mode 100644
index 0000000..3de0e93
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa64e.ll
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfvfexpa64e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfvfexpa64e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 1 x double> @test_intrinsic_sf_vfexpa_v_nxv1f64(<vscale x 1 x double> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv1f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64(
+ <vscale x 1 x double> poison,
+ <vscale x 1 x double> %0,
+ iXLen %1)
+ ret <vscale x 1 x double> %f
+}
+
+define <vscale x 2 x double> @test_intrinsic_sf_vfexpa_v_nxv2f64(<vscale x 2 x double> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv2f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64(
+ <vscale x 2 x double> poison,
+ <vscale x 2 x double> %0,
+ iXLen %1)
+ ret <vscale x 2 x double> %f
+}
+
+define <vscale x 4 x double> @test_intrinsic_sf_vfexpa_v_nxv4f64(<vscale x 4 x double> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv4f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64(
+ <vscale x 4 x double> poison,
+ <vscale x 4 x double> %0,
+ iXLen %1)
+ ret <vscale x 4 x double> %f
+}
+
+define <vscale x 8 x double> @test_intrinsic_sf_vfexpa_v_nxv8f64(<vscale x 8 x double> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv8f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64(
+ <vscale x 8 x double> poison,
+ <vscale x 8 x double> %0,
+ iXLen %1)
+ ret <vscale x 8 x double> %f
+}
+
+define <vscale x 1 x double> @test_intrinsic_sf_vfexpa_v_mask_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv1f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64(
+ <vscale x 1 x double> %0,
+ <vscale x 1 x double> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 1 x double> %f
+}
+
+define <vscale x 2 x double> @test_intrinsic_sf_vfexpa_v_mask_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv2f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64(
+ <vscale x 2 x double> %0,
+ <vscale x 2 x double> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 2 x double> %f
+}
+
+define <vscale x 4 x double> @test_intrinsic_sf_vfexpa_v_mask_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv4f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64(
+ <vscale x 4 x double> %0,
+ <vscale x 4 x double> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 4 x double> %f
+}
+
+define <vscale x 8 x double> @test_intrinsic_sf_vfexpa_v_mask_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv8f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64(
+ <vscale x 8 x double> %0,
+ <vscale x 8 x double> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 8 x double> %f
+}
diff --git a/llvm/test/CodeGen/X86/issue163738.ll b/llvm/test/CodeGen/X86/issue163738.ll
new file mode 100644
index 0000000..61fe043
--- /dev/null
+++ b/llvm/test/CodeGen/X86/issue163738.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK
+
+define <8 x i64> @foo(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpternlogq {{.*#+}} zmm0 = ~(zmm0 | zmm2 | zmm1)
+; CHECK-NEXT: retq
+ %and.demorgan = or <8 x i64> %b, %a
+ %and3.demorgan = or <8 x i64> %and.demorgan, %c
+ %and3 = xor <8 x i64> %and3.demorgan, splat (i64 -1)
+ ret <8 x i64> %and3
+}
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll
index 1c869bd..e7491e9 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll
@@ -1,14 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=msan -mattr=+sme -o - %s
-
-; XFAIL: *
+; RUN: opt -S -passes=msan -mattr=+sme -o - %s | FileCheck %s
; Forked from llvm/test/CodeGen/AArch64/sme-aarch64-svcount.ll
-; Manually minimized to show MSan leads to a compiler crash
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-android9001"
define target("aarch64.svcount") @test_return_arg1(target("aarch64.svcount") %arg0, target("aarch64.svcount") %arg1) nounwind {
+; CHECK-LABEL: @test_return_arg1(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: store target("aarch64.svcount") zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret target("aarch64.svcount") [[ARG1:%.*]]
+;
ret target("aarch64.svcount") %arg1
}
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll
index 00cf3204..e1ea9e6 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll
@@ -1,7 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=msan -mattr=+sme -o - %s
-
-; XFAIL: *
+; RUN: opt -S -passes=msan -mattr=+sme -o - %s | FileCheck %s
; Forked from llvm/test/CodeGen/AArch64/sme-aarch64-svcount.ll
@@ -12,16 +10,49 @@ target triple = "aarch64--linux-android9001"
; Test simple loads, stores and return.
;
define target("aarch64.svcount") @test_load(ptr %ptr) nounwind {
+; CHECK-LABEL: @test_load(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RES:%.*]] = load target("aarch64.svcount"), ptr [[PTR:%.*]], align 2
+; CHECK-NEXT: store target("aarch64.svcount") zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret target("aarch64.svcount") [[RES]]
+;
%res = load target("aarch64.svcount"), ptr %ptr
ret target("aarch64.svcount") %res
}
define void @test_store(ptr %ptr, target("aarch64.svcount") %val) nounwind {
+; CHECK-LABEL: @test_store(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-NEXT: store target("aarch64.svcount") zeroinitializer, ptr [[TMP3]], align 2
+; CHECK-NEXT: store target("aarch64.svcount") [[VAL:%.*]], ptr [[PTR]], align 2
+; CHECK-NEXT: ret void
+;
store target("aarch64.svcount") %val, ptr %ptr
ret void
}
define target("aarch64.svcount") @test_alloca_store_reload(target("aarch64.svcount") %val) nounwind {
+; CHECK-LABEL: @test_alloca_store_reload(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[PTR:%.*]] = alloca target("aarch64.svcount"), align 1
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP5]], i8 0, i64 [[TMP2]], i1 false)
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store target("aarch64.svcount") zeroinitializer, ptr [[TMP8]], align 2
+; CHECK-NEXT: store target("aarch64.svcount") [[VAL:%.*]], ptr [[PTR]], align 2
+; CHECK-NEXT: [[RES:%.*]] = load target("aarch64.svcount"), ptr [[PTR]], align 2
+; CHECK-NEXT: store target("aarch64.svcount") zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret target("aarch64.svcount") [[RES]]
+;
%ptr = alloca target("aarch64.svcount"), align 1
store target("aarch64.svcount") %val, ptr %ptr
%res = load target("aarch64.svcount"), ptr %ptr
@@ -33,10 +64,20 @@ define target("aarch64.svcount") @test_alloca_store_reload(target("aarch64.svcou
;
define target("aarch64.svcount") @test_return_arg1(target("aarch64.svcount") %arg0, target("aarch64.svcount") %arg1) nounwind {
+; CHECK-LABEL: @test_return_arg1(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: store target("aarch64.svcount") zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret target("aarch64.svcount") [[ARG1:%.*]]
+;
ret target("aarch64.svcount") %arg1
}
define target("aarch64.svcount") @test_return_arg4(target("aarch64.svcount") %arg0, target("aarch64.svcount") %arg1, target("aarch64.svcount") %arg2, target("aarch64.svcount") %arg3, target("aarch64.svcount") %arg4) nounwind {
+; CHECK-LABEL: @test_return_arg4(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: store target("aarch64.svcount") zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret target("aarch64.svcount") [[ARG4:%.*]]
+;
ret target("aarch64.svcount") %arg4
}
@@ -46,22 +87,58 @@ define target("aarch64.svcount") @test_return_arg4(target("aarch64.svcount") %ar
declare void @take_svcount_1(target("aarch64.svcount") %arg)
define void @test_pass_1arg(target("aarch64.svcount") %arg) nounwind {
+; CHECK-LABEL: @test_pass_1arg(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: call void @take_svcount_1(target("aarch64.svcount") [[ARG:%.*]])
+; CHECK-NEXT: ret void
+;
call void @take_svcount_1(target("aarch64.svcount") %arg)
ret void
}
declare void @take_svcount_5(target("aarch64.svcount") %arg0, target("aarch64.svcount") %arg1, target("aarch64.svcount") %arg2, target("aarch64.svcount") %arg3, target("aarch64.svcount") %arg4)
define void @test_pass_5args(target("aarch64.svcount") %arg) nounwind {
+; CHECK-LABEL: @test_pass_5args(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: call void @take_svcount_5(target("aarch64.svcount") [[ARG:%.*]], target("aarch64.svcount") [[ARG]], target("aarch64.svcount") [[ARG]], target("aarch64.svcount") [[ARG]], target("aarch64.svcount") [[ARG]])
+; CHECK-NEXT: ret void
+;
call void @take_svcount_5(target("aarch64.svcount") %arg, target("aarch64.svcount") %arg, target("aarch64.svcount") %arg, target("aarch64.svcount") %arg, target("aarch64.svcount") %arg)
ret void
}
define target("aarch64.svcount") @test_sel(target("aarch64.svcount") %x, target("aarch64.svcount") %y, i1 %cmp) sanitize_memory {
+; CHECK-LABEL: @test_sel(
+; CHECK-NEXT: [[TMP1:%.*]] = load i1, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[CMP:%.*]], target("aarch64.svcount") zeroinitializer, target("aarch64.svcount") zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[TMP1]], target("aarch64.svcount") zeroinitializer, target("aarch64.svcount") [[TMP2]]
+; CHECK-NEXT: [[X_Y:%.*]] = select i1 [[CMP]], target("aarch64.svcount") [[X:%.*]], target("aarch64.svcount") [[Y:%.*]]
+; CHECK-NEXT: store target("aarch64.svcount") [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret target("aarch64.svcount") [[X_Y]]
+;
%x.y = select i1 %cmp, target("aarch64.svcount") %x, target("aarch64.svcount") %y
ret target("aarch64.svcount") %x.y
}
define target("aarch64.svcount") @test_sel_cc(target("aarch64.svcount") %x, target("aarch64.svcount") %y, i32 %k) sanitize_memory {
+; CHECK-LABEL: @test_sel_cc(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[K:%.*]], -2147483648
+; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP4]], -2147483606
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], -2147483606
+; CHECK-NEXT: [[TMP8:%.*]] = xor i1 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[K]], 42
+; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[CMP]], target("aarch64.svcount") zeroinitializer, target("aarch64.svcount") zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[TMP8]], target("aarch64.svcount") zeroinitializer, target("aarch64.svcount") [[TMP9]]
+; CHECK-NEXT: [[X_Y:%.*]] = select i1 [[CMP]], target("aarch64.svcount") [[X:%.*]], target("aarch64.svcount") [[Y:%.*]]
+; CHECK-NEXT: store target("aarch64.svcount") [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret target("aarch64.svcount") [[X_Y]]
+;
%cmp = icmp sgt i32 %k, 42
%x.y = select i1 %cmp, target("aarch64.svcount") %x, target("aarch64.svcount") %y
ret target("aarch64.svcount") %x.y
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll
index 3f43efa..3ae73c5 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll
@@ -1,7 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=msan -mattr=+sme2 -mattr=+sme-i16i64 -mattr=+sme-f64f64 -o - %s
-
-; XFAIL: *
+; RUN: opt -S -passes=msan -mattr=+sme2 -mattr=+sme-i16i64 -mattr=+sme-f64f64 -o - %s | FileCheck %s
; Forked from llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll
; Manually reduced to show MSan leads to a compiler crash
@@ -10,6 +8,19 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-android9001"
define void @multi_vector_add_za_vg1x4_f32_tuple(i64 %stride, ptr %ptr) sanitize_memory {
+; CHECK-LABEL: @multi_vector_add_za_vg1x4_f32_tuple(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5:[0-9]+]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") [[TMP2]], ptr [[PTR:%.*]])
+; CHECK-NEXT: ret void
+;
%1 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%2 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %1, ptr %ptr)
ret void
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll
index cd04373..8d00b93 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll
@@ -1,7 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=msan -mattr=+sme2 -mattr=+sme-i16i64 -mattr=+sme-f64f64 -o - %s
-
-; XFAIL: *
+; RUN: opt -S -passes=msan -mattr=+sme2 -mattr=+sme-i16i64 -mattr=+sme-f64f64 -o - %s | FileCheck %s
; Forked from llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll
@@ -9,6 +7,27 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-android9001"
define void @multi_vector_add_write_single_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zm) sanitize_memory {
+; CHECK-LABEL: @multi_vector_add_write_single_za_vg1x2_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: 2:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7:[0-9]+]]
+; CHECK-NEXT: unreachable
+; CHECK: 3:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE:%.*]], <vscale x 4 x i32> [[ZN0:%.*]], <vscale x 4 x i32> [[ZN1:%.*]], <vscale x 4 x i32> [[ZM:%.*]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], 0
+; CHECK-NEXT: [[SLICE_7:%.*]] = add i32 [[SLICE]], 7
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_7]], <vscale x 4 x i32> [[ZN0]], <vscale x 4 x i32> [[ZN1]], <vscale x 4 x i32> [[ZM]])
+; CHECK-NEXT: ret void
+;
call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 %slice,
<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
<vscale x 4 x i32> %zm)
@@ -20,6 +39,27 @@ define void @multi_vector_add_write_single_za_vg1x2_i32(i32 %slice, <vscale x 4
}
define void @multi_vector_add_write_single_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zm) sanitize_memory {
+; CHECK-LABEL: @multi_vector_add_write_single_za_vg1x2_i64(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK: 2:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 3:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE:%.*]], <vscale x 2 x i64> [[ZN0:%.*]], <vscale x 2 x i64> [[ZN1:%.*]], <vscale x 2 x i64> [[ZM:%.*]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], 0
+; CHECK-NEXT: [[SLICE_7:%.*]] = add i32 [[SLICE]], 7
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_7]], <vscale x 2 x i64> [[ZN0]], <vscale x 2 x i64> [[ZN1]], <vscale x 2 x i64> [[ZM]])
+; CHECK-NEXT: ret void
+;
call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 %slice,
<vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
<vscale x 2 x i64> %zm)
@@ -32,6 +72,27 @@ define void @multi_vector_add_write_single_za_vg1x2_i64(i32 %slice, <vscale x 2
define void @multi_vector_add_write_single_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+; CHECK-LABEL: @multi_vector_add_write_single_za_vg1x4_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK: 2:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 3:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE:%.*]], <vscale x 4 x i32> [[ZN0:%.*]], <vscale x 4 x i32> [[ZN1:%.*]], <vscale x 4 x i32> [[ZN2:%.*]], <vscale x 4 x i32> [[ZN3:%.*]], <vscale x 4 x i32> [[ZM:%.*]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], 0
+; CHECK-NEXT: [[SLICE_7:%.*]] = add i32 [[SLICE]], 7
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_7]], <vscale x 4 x i32> [[ZN0]], <vscale x 4 x i32> [[ZN1]], <vscale x 4 x i32> [[ZN2]], <vscale x 4 x i32> [[ZN3]], <vscale x 4 x i32> [[ZM]])
+; CHECK-NEXT: ret void
+;
<vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
<vscale x 4 x i32> %zm) sanitize_memory {
call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 %slice,
@@ -47,6 +108,27 @@ define void @multi_vector_add_write_single_za_vg1x4_i32(i32 %slice, <vscale x 4
}
define void @multi_vector_add_write_single_za_vg1x4_i64(i32 %slice,
+; CHECK-LABEL: @multi_vector_add_write_single_za_vg1x4_i64(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK: 2:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 3:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE:%.*]], <vscale x 2 x i64> [[ZN0:%.*]], <vscale x 2 x i64> [[ZN1:%.*]], <vscale x 2 x i64> [[ZN2:%.*]], <vscale x 2 x i64> [[ZN3:%.*]], <vscale x 2 x i64> [[ZM:%.*]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], 0
+; CHECK-NEXT: [[SLICE_7:%.*]] = add i32 [[SLICE]], 7
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_7]], <vscale x 2 x i64> [[ZN0]], <vscale x 2 x i64> [[ZN1]], <vscale x 2 x i64> [[ZN2]], <vscale x 2 x i64> [[ZN3]], <vscale x 2 x i64> [[ZM]])
+; CHECK-NEXT: ret void
+;
<vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
<vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
<vscale x 2 x i64> %zm) sanitize_memory {
@@ -64,6 +146,27 @@ define void @multi_vector_add_write_single_za_vg1x4_i64(i32 %slice,
define void @multi_vector_add_write_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+; CHECK-LABEL: @multi_vector_add_write_za_vg1x2_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK: 2:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 3:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE:%.*]], <vscale x 4 x i32> [[ZN0:%.*]], <vscale x 4 x i32> [[ZN1:%.*]], <vscale x 4 x i32> [[ZM1:%.*]], <vscale x 4 x i32> [[ZM2:%.*]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], 0
+; CHECK-NEXT: [[SLICE_7:%.*]] = add i32 [[SLICE]], 7
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_7]], <vscale x 4 x i32> [[ZN0]], <vscale x 4 x i32> [[ZN1]], <vscale x 4 x i32> [[ZM1]], <vscale x 4 x i32> [[ZM2]])
+; CHECK-NEXT: ret void
+;
<vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2) sanitize_memory {
call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 %slice,
<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
@@ -77,6 +180,27 @@ define void @multi_vector_add_write_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32>
define void @multi_vector_add_write_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+; CHECK-LABEL: @multi_vector_add_write_za_vg1x2_i64(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK: 2:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 3:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE:%.*]], <vscale x 2 x i64> [[ZN0:%.*]], <vscale x 2 x i64> [[ZN1:%.*]], <vscale x 2 x i64> [[ZM1:%.*]], <vscale x 2 x i64> [[ZM2:%.*]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], 0
+; CHECK-NEXT: [[SLICE_7:%.*]] = add i32 [[SLICE]], 7
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_7]], <vscale x 2 x i64> [[ZN0]], <vscale x 2 x i64> [[ZN1]], <vscale x 2 x i64> [[ZM1]], <vscale x 2 x i64> [[ZM2]])
+; CHECK-NEXT: ret void
+;
<vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2) sanitize_memory {
call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 %slice,
<vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
@@ -91,6 +215,27 @@ define void @multi_vector_add_write_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64>
define void @multi_vector_add_write_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+; CHECK-LABEL: @multi_vector_add_write_za_vg1x4_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK: 2:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 3:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE:%.*]], <vscale x 4 x i32> [[ZN0:%.*]], <vscale x 4 x i32> [[ZN1:%.*]], <vscale x 4 x i32> [[ZN2:%.*]], <vscale x 4 x i32> [[ZN3:%.*]], <vscale x 4 x i32> [[ZM0:%.*]], <vscale x 4 x i32> [[ZM1:%.*]], <vscale x 4 x i32> [[ZM2:%.*]], <vscale x 4 x i32> [[ZM3:%.*]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], 0
+; CHECK-NEXT: [[SLICE_7:%.*]] = add i32 [[SLICE]], 7
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_7]], <vscale x 4 x i32> [[ZN0]], <vscale x 4 x i32> [[ZN1]], <vscale x 4 x i32> [[ZN2]], <vscale x 4 x i32> [[ZN3]], <vscale x 4 x i32> [[ZM0]], <vscale x 4 x i32> [[ZM1]], <vscale x 4 x i32> [[ZM2]], <vscale x 4 x i32> [[ZM3]])
+; CHECK-NEXT: ret void
+;
<vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
<vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1,
<vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3) sanitize_memory {
@@ -109,6 +254,27 @@ define void @multi_vector_add_write_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32>
}
define void @multi_vector_add_write_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+; CHECK-LABEL: @multi_vector_add_write_za_vg1x4_i64(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK: 2:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 3:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE:%.*]], <vscale x 2 x i64> [[ZN0:%.*]], <vscale x 2 x i64> [[ZN1:%.*]], <vscale x 2 x i64> [[ZN2:%.*]], <vscale x 2 x i64> [[ZN3:%.*]], <vscale x 2 x i64> [[ZM0:%.*]], <vscale x 2 x i64> [[ZM1:%.*]], <vscale x 2 x i64> [[ZM2:%.*]], <vscale x 2 x i64> [[ZM3:%.*]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], 0
+; CHECK-NEXT: [[SLICE_7:%.*]] = add i32 [[SLICE]], 7
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_7]], <vscale x 2 x i64> [[ZN0]], <vscale x 2 x i64> [[ZN1]], <vscale x 2 x i64> [[ZN2]], <vscale x 2 x i64> [[ZN3]], <vscale x 2 x i64> [[ZM0]], <vscale x 2 x i64> [[ZM1]], <vscale x 2 x i64> [[ZM2]], <vscale x 2 x i64> [[ZM3]])
+; CHECK-NEXT: ret void
+;
<vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
<vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1,
<vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3) sanitize_memory {
@@ -127,6 +293,27 @@ define void @multi_vector_add_write_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64>
}
define void @multi_vector_add_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) sanitize_memory {
+; CHECK-LABEL: @multi_vector_add_za_vg1x2_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK: 2:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 3:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE:%.*]], <vscale x 4 x i32> [[ZN0:%.*]], <vscale x 4 x i32> [[ZN1:%.*]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], 0
+; CHECK-NEXT: [[SLICE_7:%.*]] = add i32 [[SLICE]], 7
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_7]], <vscale x 4 x i32> [[ZN0]], <vscale x 4 x i32> [[ZN1]])
+; CHECK-NEXT: ret void
+;
call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 %slice,<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 %slice.7, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
@@ -134,6 +321,27 @@ define void @multi_vector_add_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0,
}
define void @multi_vector_add_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1) sanitize_memory {
+; CHECK-LABEL: @multi_vector_add_za_vg1x2_i64(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK: 2:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 3:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE:%.*]], <vscale x 2 x i64> [[ZN0:%.*]], <vscale x 2 x i64> [[ZN1:%.*]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], 0
+; CHECK-NEXT: [[SLICE_7:%.*]] = add i32 [[SLICE]], 7
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_7]], <vscale x 2 x i64> [[ZN0]], <vscale x 2 x i64> [[ZN1]])
+; CHECK-NEXT: ret void
+;
call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 %slice.7, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1)
@@ -141,6 +349,27 @@ define void @multi_vector_add_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0,
}
define void @multi_vector_add_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) sanitize_memory {
+; CHECK-LABEL: @multi_vector_add_za_vg1x2_f32(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK: 2:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 3:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 [[SLICE:%.*]], <vscale x 4 x float> [[ZN0:%.*]], <vscale x 4 x float> [[ZN1:%.*]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], 0
+; CHECK-NEXT: [[SLICE_7:%.*]] = add i32 [[SLICE]], 7
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 [[SLICE_7]], <vscale x 4 x float> [[ZN0]], <vscale x 4 x float> [[ZN1]])
+; CHECK-NEXT: ret void
+;
call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 %slice,
<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
%slice.7 = add i32 %slice, 7
@@ -150,6 +379,27 @@ define void @multi_vector_add_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0
}
define void @multi_vector_add_za_vg1x2_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1) sanitize_memory {
+; CHECK-LABEL: @multi_vector_add_za_vg1x2_f64(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK: 2:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 3:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 [[SLICE:%.*]], <vscale x 2 x double> [[ZN0:%.*]], <vscale x 2 x double> [[ZN1:%.*]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], 0
+; CHECK-NEXT: [[SLICE_7:%.*]] = add i32 [[SLICE]], 7
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 [[SLICE_7]], <vscale x 2 x double> [[ZN0]], <vscale x 2 x double> [[ZN1]])
+; CHECK-NEXT: ret void
+;
call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 %slice,
<vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1)
%slice.7 = add i32 %slice, 7
@@ -159,6 +409,36 @@ define void @multi_vector_add_za_vg1x2_f64(i32 %slice, <vscale x 2 x double> %zn
}
define void @multi_vector_add_za_vg1x2_f64_tuple(i64 %stride, ptr %ptr) sanitize_memory {
+; CHECK-LABEL: @multi_vector_add_za_vg1x2_f64_tuple(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") [[TMP2]], ptr [[PTR:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP5]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP5]], 1
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[STRIDE:%.*]]
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") [[TMP2]], ptr [[ARRAYIDX2]])
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP10]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP10]], 1
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 0, <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> [[TMP11]])
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 0, <vscale x 2 x double> [[TMP7]], <vscale x 2 x double> [[TMP12]])
+; CHECK-NEXT: ret void
+;
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") %0, ptr %ptr)
@@ -175,6 +455,27 @@ entry:
define void @multi_vector_add_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) sanitize_memory {
+; CHECK-LABEL: @multi_vector_add_za_vg1x4_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK: 2:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 3:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE:%.*]], <vscale x 4 x i32> [[ZN0:%.*]], <vscale x 4 x i32> [[ZN1:%.*]], <vscale x 4 x i32> [[ZN2:%.*]], <vscale x 4 x i32> [[ZN3:%.*]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], 0
+; CHECK-NEXT: [[SLICE_7:%.*]] = add i32 [[SLICE]], 7
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_7]], <vscale x 4 x i32> [[ZN0]], <vscale x 4 x i32> [[ZN1]], <vscale x 4 x i32> [[ZN2]], <vscale x 4 x i32> [[ZN3]])
+; CHECK-NEXT: ret void
+;
call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 %slice,
<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
<vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
@@ -186,6 +487,27 @@ define void @multi_vector_add_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0,
}
define void @multi_vector_add_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3) sanitize_memory {
+; CHECK-LABEL: @multi_vector_add_za_vg1x4_i64(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK: 2:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 3:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE:%.*]], <vscale x 2 x i64> [[ZN0:%.*]], <vscale x 2 x i64> [[ZN1:%.*]], <vscale x 2 x i64> [[ZN2:%.*]], <vscale x 2 x i64> [[ZN3:%.*]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], 0
+; CHECK-NEXT: [[SLICE_7:%.*]] = add i32 [[SLICE]], 7
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_7]], <vscale x 2 x i64> [[ZN0]], <vscale x 2 x i64> [[ZN1]], <vscale x 2 x i64> [[ZN2]], <vscale x 2 x i64> [[ZN3]])
+; CHECK-NEXT: ret void
+;
call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 %slice,
<vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
<vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3)
@@ -197,6 +519,27 @@ define void @multi_vector_add_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0,
}
define void @multi_vector_add_za_vg1x4_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) sanitize_memory {
+; CHECK-LABEL: @multi_vector_add_za_vg1x4_f32(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK: 2:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 3:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 [[SLICE:%.*]], <vscale x 4 x float> [[ZN0:%.*]], <vscale x 4 x float> [[ZN1:%.*]], <vscale x 4 x float> [[ZN2:%.*]], <vscale x 4 x float> [[ZN3:%.*]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], 0
+; CHECK-NEXT: [[SLICE_7:%.*]] = add i32 [[SLICE]], 7
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 [[SLICE_7]], <vscale x 4 x float> [[ZN0]], <vscale x 4 x float> [[ZN1]], <vscale x 4 x float> [[ZN2]], <vscale x 4 x float> [[ZN3]])
+; CHECK-NEXT: ret void
+;
call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 %slice,
<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
<vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
@@ -208,6 +551,73 @@ define void @multi_vector_add_za_vg1x4_f32(i32 %slice, <vscale x 4 x float> %zn0
}
define void @multi_vector_add_za_vg1x4_f32_tuple(i64 %stride, ptr %ptr) sanitize_memory {
+; CHECK-LABEL: @multi_vector_add_za_vg1x4_f32_tuple(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") [[TMP2]], ptr [[PTR:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP5]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP5]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP5]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP5]], 3
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[STRIDE:%.*]]
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[TMP12:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") [[TMP2]], ptr [[ARRAYIDX2]])
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 1
+; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 2
+; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 3
+; CHECK-NEXT: [[TMP17:%.*]] = shl i64 [[TMP1]], 1
+; CHECK-NEXT: [[TMP18:%.*]] = or i64 [[TMP17]], 0
+; CHECK-NEXT: [[MUL3:%.*]] = shl i64 [[STRIDE]], 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[TMP0]], [[TMP18]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[MUL3]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[_MSPROP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP4]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK: 19:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 20:
+; CHECK-NEXT: [[TMP21:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") [[TMP2]], ptr [[ARRAYIDX4]])
+; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP21]], 0
+; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP21]], 1
+; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP21]], 2
+; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP21]], 3
+; CHECK-NEXT: [[MSPROP_MUL_CST:%.*]] = mul i64 [[TMP1]], 1
+; CHECK-NEXT: [[MUL5:%.*]] = mul i64 [[STRIDE]], 3
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or i64 [[TMP0]], [[MSPROP_MUL_CST]]
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[MUL5]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i64 [[_MSPROP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP5]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]]
+; CHECK: 26:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 27:
+; CHECK-NEXT: [[TMP28:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") [[TMP2]], ptr [[ARRAYIDX6]])
+; CHECK-NEXT: [[TMP29:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP28]], 0
+; CHECK-NEXT: [[TMP30:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP28]], 1
+; CHECK-NEXT: [[TMP31:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP28]], 2
+; CHECK-NEXT: [[TMP32:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP28]], 3
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, <vscale x 4 x float> [[TMP6]], <vscale x 4 x float> [[TMP13]], <vscale x 4 x float> [[TMP22]], <vscale x 4 x float> [[TMP29]])
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, <vscale x 4 x float> [[TMP7]], <vscale x 4 x float> [[TMP14]], <vscale x 4 x float> [[TMP23]], <vscale x 4 x float> [[TMP30]])
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, <vscale x 4 x float> [[TMP8]], <vscale x 4 x float> [[TMP15]], <vscale x 4 x float> [[TMP24]], <vscale x 4 x float> [[TMP31]])
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, <vscale x 4 x float> [[TMP9]], <vscale x 4 x float> [[TMP16]], <vscale x 4 x float> [[TMP25]], <vscale x 4 x float> [[TMP32]])
+; CHECK-NEXT: ret void
+;
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %ptr)
@@ -243,6 +653,27 @@ entry:
}
define void @multi_vector_add_za_vg1x4_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3) sanitize_memory {
+; CHECK-LABEL: @multi_vector_add_za_vg1x4_f64(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK: 2:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 3:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 [[SLICE:%.*]], <vscale x 2 x double> [[ZN0:%.*]], <vscale x 2 x double> [[ZN1:%.*]], <vscale x 2 x double> [[ZN2:%.*]], <vscale x 2 x double> [[ZN3:%.*]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], 0
+; CHECK-NEXT: [[SLICE_7:%.*]] = add i32 [[SLICE]], 7
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 [[SLICE_7]], <vscale x 2 x double> [[ZN0]], <vscale x 2 x double> [[ZN1]], <vscale x 2 x double> [[ZN2]], <vscale x 2 x double> [[ZN3]])
+; CHECK-NEXT: ret void
+;
call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 %slice,
<vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
<vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3)
@@ -255,6 +686,12 @@ define void @multi_vector_add_za_vg1x4_f64(i32 %slice, <vscale x 2 x double> %zn
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_add_single_x2_s8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zm) sanitize_memory {
+; CHECK-LABEL: @multi_vec_add_single_x2_s8(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RES:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8> [[ZDN1:%.*]], <vscale x 16 x i8> [[ZDN2:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+; CHECK-NEXT: store { <vscale x 16 x i8>, <vscale x 16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[RES]]
+;
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8> }
@llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2,
<vscale x 16 x i8> %zm)
@@ -262,6 +699,12 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_add_single_x2_s8(<v
}
define { <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_add_single_x2_s16(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zm) sanitize_memory {
+; CHECK-LABEL: @multi_vec_add_single_x2_s16(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RES:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x2.nxv8i16(<vscale x 8 x i16> [[ZDN1:%.*]], <vscale x 8 x i16> [[ZDN2:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+; CHECK-NEXT: store { <vscale x 8 x i16>, <vscale x 8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[RES]]
+;
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16> }
@llvm.aarch64.sve.add.single.x2.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2,
<vscale x 8 x i16> %zm)
@@ -269,6 +712,12 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_add_single_x2_s16(<
}
define { <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_add_single_x2_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zm) sanitize_memory {
+; CHECK-LABEL: @multi_vec_add_single_x2_s32(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RES:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x2.nxv4i32(<vscale x 4 x i32> [[ZDN1:%.*]], <vscale x 4 x i32> [[ZDN2:%.*]], <vscale x 4 x i32> [[ZM:%.*]])
+; CHECK-NEXT: store { <vscale x 4 x i32>, <vscale x 4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[RES]]
+;
%res = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
@llvm.aarch64.sve.add.single.x2.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2,
<vscale x 4 x i32> %zm)
@@ -276,6 +725,12 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_add_single_x2_s32(<
}
define { <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_add_single_x2_s64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zm) sanitize_memory {
+; CHECK-LABEL: @multi_vec_add_single_x2_s64(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RES:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x2.nxv2i64(<vscale x 2 x i64> [[ZDN1:%.*]], <vscale x 2 x i64> [[ZDN2:%.*]], <vscale x 2 x i64> [[ZM:%.*]])
+; CHECK-NEXT: store { <vscale x 2 x i64>, <vscale x 2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[RES]]
+;
%res = call { <vscale x 2 x i64>, <vscale x 2 x i64> }
@llvm.aarch64.sve.add.single.x2.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2,
<vscale x 2 x i64> %zm)
@@ -284,6 +739,12 @@ define { <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_add_single_x2_s64(<
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_add_single_x4_s8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4, <vscale x 16 x i8>%zm) sanitize_memory {
+; CHECK-LABEL: @multi_vec_add_single_x4_s8(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RES:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x4.nxv16i8(<vscale x 16 x i8> [[ZDN1:%.*]], <vscale x 16 x i8> [[ZDN2:%.*]], <vscale x 16 x i8> [[ZDN3:%.*]], <vscale x 16 x i8> [[ZDN4:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+; CHECK-NEXT: store { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[RES]]
+;
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }
@llvm.aarch64.sve.add.single.x4.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2,
<vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4,
@@ -292,6 +753,12 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
}
define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_add_x4_single_s16(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4, <vscale x 8 x i16> %zm) sanitize_memory {
+; CHECK-LABEL: @multi_vec_add_x4_single_s16(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RES:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x4.nxv8i16(<vscale x 8 x i16> [[ZDN1:%.*]], <vscale x 8 x i16> [[ZDN2:%.*]], <vscale x 8 x i16> [[ZDN3:%.*]], <vscale x 8 x i16> [[ZDN4:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+; CHECK-NEXT: store { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[RES]]
+;
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }
@llvm.aarch64.sve.add.single.x4.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2,
<vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4,
@@ -300,6 +767,12 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
}
define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_add_x4_single_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4, <vscale x 4 x i32> %zm) sanitize_memory {
+; CHECK-LABEL: @multi_vec_add_x4_single_s32(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RES:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32> [[ZDN1:%.*]], <vscale x 4 x i32> [[ZDN2:%.*]], <vscale x 4 x i32> [[ZDN3:%.*]], <vscale x 4 x i32> [[ZDN4:%.*]], <vscale x 4 x i32> [[ZM:%.*]])
+; CHECK-NEXT: store { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[RES]]
+;
%res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }
@llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2,
<vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4,
@@ -308,6 +781,12 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4
}
define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_add_x4_single_s64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4, <vscale x 2 x i64> %zm) sanitize_memory {
+; CHECK-LABEL: @multi_vec_add_x4_single_s64(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RES:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64> [[ZDN1:%.*]], <vscale x 2 x i64> [[ZDN2:%.*]], <vscale x 2 x i64> [[ZDN3:%.*]], <vscale x 2 x i64> [[ZDN4:%.*]], <vscale x 2 x i64> [[ZM:%.*]])
+; CHECK-NEXT: store { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[RES]]
+;
%res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }
@llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2,
<vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4,
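
All of the MSan CHECK lines added above follow one shape: load the operand's shadow from __msan_param_tls, compare it with zero, and branch to __msan_warning_noreturn() if any shadow bit is set. A minimal stand-alone sketch of that pattern (not from the patch; the runtime call is stubbed so it builds on its own):

// Conceptual shape of the per-operand shadow check in the IR above.
// msan_warning_noreturn() stands in for the real __msan_warning_noreturn().
#include <cstdint>
#include <cstdio>
#include <cstdlib>

[[noreturn]] static void msan_warning_noreturn() {
  std::fputs("WARNING: use-of-uninitialized-value\n", stderr);
  std::abort();
}

static void check_shadow(uint32_t shadow) {
  if (shadow != 0)           // _MSCMP = icmp ne i32 shadow, 0
    msan_warning_noreturn(); // the branch the IR marks unreachable after
}

int main() {
  check_shadow(0); // a fully initialized %slice produces no report
  std::puts("clean operand passed the shadow check");
  return 0;
}
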
diff --git a/llvm/test/MC/ELF/cfi-sframe-cfi-escape-diagnostics.s b/llvm/test/MC/ELF/cfi-sframe-cfi-escape-diagnostics.s
new file mode 100644
index 0000000..cb44a76
--- /dev/null
+++ b/llvm/test/MC/ELF/cfi-sframe-cfi-escape-diagnostics.s
@@ -0,0 +1,36 @@
+# RUN: llvm-mc --filetype=obj --gsframe -triple x86_64 %s -o %t.o 2>&1 | FileCheck %s
+# RUN: llvm-readelf --sframe %t.o | FileCheck %s --check-prefix=NOFDES
+
+## Tests that .cfi_escape sequences that are unrepresentable in SFrame emit a
+## warning and do not produce FDEs.
+
+ .align 1024
+cfi_escape_sp:
+ .cfi_startproc
+ .long 0
+## Setting SP via another register makes it unrepresentable in SFrame
+## DW_CFA_expression,reg 0x7,length 2,DW_OP_breg6,SLEB(-8)
+# CHECK: {{.*}}.s:[[#@LINE+1]]:9: warning: skipping SFrame FDE; .cfi_escape DW_CFA_expression with SP reg 7
+ .cfi_escape 0x10, 0x7, 0x2, 0x76, 0x78
+ .long 0
+.cfi_endproc
+
+cfi_escape_args_sp:
+ .cfi_startproc
+ .long 0
+## DW_CFA_GNU_args_size is not OK if the CFA is SP
+# CHECK: {{.*}}.s:[[#@LINE+1]]:9: warning: skipping SFrame FDE; .cfi_escape DW_CFA_GNU_args_size with non frame-pointer CFA
+ .cfi_escape 0x2e, 0x20
+ .cfi_endproc
+
+cfi_escape_val_offset:
+ .cfi_startproc
+ .long 0
+ .cfi_def_cfa_offset 16
+## DW_CFA_val_offset,rbp,ULEB scaled offset(16)
+# CHECK: {{.*}}.s:[[#@LINE+1]]:9: warning: skipping SFrame FDE; .cfi_escape DW_CFA_val_offset with FP reg 6
+ .cfi_escape 0x14,0x6,0x2
+ .long 0
+ .cfi_endproc
+
+# NOFDES: Num FDEs: 0
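
The comments above decode each escape by hand; the only non-obvious operand is the trailing SLEB128 byte. A small sketch (not from the patch) that decodes the final byte of 0x10,0x7,0x2,0x76,0x78 and confirms it is the -8 offset of DW_OP_breg6:

// Minimal SLEB128 decode of the last .cfi_escape operand above.
#include <cassert>
#include <cstddef>
#include <cstdint>

static int64_t decodeSLEB128(const uint8_t *p, size_t n) {
  int64_t result = 0;
  unsigned shift = 0;
  uint8_t byte = 0;
  for (size_t i = 0; i < n; ++i) {
    byte = p[i];
    result |= int64_t(byte & 0x7f) << shift;
    shift += 7;
    if (!(byte & 0x80))
      break;
  }
  if (shift < 64 && (byte & 0x40))
    result |= -(int64_t(1) << shift); // sign-extend the final byte
  return result;
}

int main() {
  const uint8_t esc[] = {0x10, 0x07, 0x02, 0x76, 0x78};
  assert(decodeSLEB128(&esc[4], 1) == -8); // DW_OP_breg6 offset: [rbp - 8]
  return 0;
}
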
diff --git a/llvm/test/MC/ELF/cfi-sframe-cfi-escape.s b/llvm/test/MC/ELF/cfi-sframe-cfi-escape.s
new file mode 100644
index 0000000..df8e7d2
--- /dev/null
+++ b/llvm/test/MC/ELF/cfi-sframe-cfi-escape.s
@@ -0,0 +1,46 @@
+# RUN: llvm-mc --filetype=obj --gsframe -triple x86_64 %s -o %t.o
+# RUN: llvm-readelf --sframe %t.o | FileCheck %s
+
+## Tests that .cfi_escape sequences that are safe to pass through still produce FDEs.
+
+ .align 1024
+cfi_escape_ok:
+ .cfi_startproc
+ .long 0
+ .cfi_def_cfa_offset 16
+ ## Uninteresting register
+## DW_CFA_expression,reg 0xc,length 2,DW_OP_breg6,SLEB(-8)
+ .cfi_escape 0x10,0xc,0x2,0x76,0x78
+## DW_CFA_nop
+ .cfi_escape 0x0
+ .cfi_escape 0x0,0x0,0x0,0x0
+ ## Uninteresting register
+## DW_CFA_val_offset,reg 0xc,ULEB scaled offset
+ .cfi_escape 0x14,0xc,0x4
+ .long 0
+ .cfi_endproc
+
+cfi_escape_gnu_args_fp:
+ .cfi_startproc
+ .long 0
+## DW_CFA_GNU_args_size is OK if arg size is zero
+ .cfi_escape 0x2e, 0x0
+ .long 0
+ .cfi_def_cfa_register 6
+ .long 0
+## DW_CFA_GNU_args_size is OK if the CFA is FP
+ .cfi_escape 0x2e, 0x20
+ .cfi_endproc
+
+cfi_escape_long_expr:
+ .cfi_startproc
+ .long 0
+ .cfi_def_cfa_offset 16
+## This is a long, but valid, DWARF expression without SFrame
+## implications. An FDE can still be created.
+## DW_CFA_val_offset,rcx,ULEB scaled offset(16), DW_CFA_expression,r10,length 2,DW_OP_breg6,SLEB(-8)
+ .cfi_escape 0x14,0x2,0x2,0x10,0xa,0x2,0x76,0x78
+ .long 0
+ .cfi_endproc
+
+# CHECK: Num FDEs: 3
diff --git a/llvm/test/MC/Hexagon/arch-support.s b/llvm/test/MC/Hexagon/arch-support.s
index eb362a7..94a6eb1 100644
--- a/llvm/test/MC/Hexagon/arch-support.s
+++ b/llvm/test/MC/Hexagon/arch-support.s
@@ -10,6 +10,7 @@
# RUN: llvm-mc -triple=hexagon -mv73 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V73 %s
# RUN: llvm-mc -triple=hexagon -mv75 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V75 %s
# RUN: llvm-mc -triple=hexagon -mv79 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V79 %s
+# RUN: llvm-mc -triple=hexagon -mv81 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V81 %s
## Check which arch version llvm-mc sets when the user does not provide one.
# RUN: llvm-mc -triple=hexagon -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-DEFAULT %s
@@ -26,6 +27,7 @@
# RUN: llvm-mc -triple=hexagon -mv73 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
# RUN: llvm-mc -triple=hexagon -mv75 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
# RUN: llvm-mc -triple=hexagon -mv79 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
+# RUN: llvm-mc -triple=hexagon -mv81 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
.text
r1 = r1
@@ -41,6 +43,7 @@ r1 = r1
# CHECK-V73: Flags:{{.*}}0x73
# CHECK-V75: Flags:{{.*}}0x75
# CHECK-V79: Flags:{{.*}}0x79
+# CHECK-V81: Flags:{{.*}}0x81
# CHECK-DEFAULT: Flags:{{.*}}0x68
# CHECK-OBJDUMP: { r1 = r1 }
diff --git a/llvm/test/MC/Hexagon/v81_arch.s b/llvm/test/MC/Hexagon/v81_arch.s
new file mode 100644
index 0000000..0cd5d6b
--- /dev/null
+++ b/llvm/test/MC/Hexagon/v81_arch.s
@@ -0,0 +1,10 @@
+# RUN: llvm-mc -arch=hexagon -mcpu=hexagonv81 -filetype=obj %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -arch=hexagon -mcpu=hexagonv81 -mhvx -filetype=obj %s | llvm-objdump -d - | FileCheck %s
+
+r1=memw(r0)
+{ r0=r0
+ memw(r0)=r0.new }
+
+# CHECK: { r1 = memw(r0+#0x0) }
+# CHECK: { r0 = r0
+# CHECK: memw(r0+#0x0) = r0.new }
diff --git a/llvm/test/MC/PowerPC/ppc64-encoding-ext.s b/llvm/test/MC/PowerPC/ppc64-encoding-ext.s
index 959f3c5..6662220 100644
--- a/llvm/test/MC/PowerPC/ppc64-encoding-ext.s
+++ b/llvm/test/MC/PowerPC/ppc64-encoding-ext.s
@@ -3491,12 +3491,18 @@
# CHECK-BE: mfamr 2 # encoding: [0x7c,0x5d,0x02,0xa6]
# CHECK-LE: mfamr 2 # encoding: [0xa6,0x02,0x5d,0x7c]
mfamr 2
-# CHECK-BE: mtpid 2 # encoding: [0x7c,0x50,0x0b,0xa6]
-# CHECK-LE: mtpid 2 # encoding: [0xa6,0x0b,0x50,0x7c]
+# CHECK-BE: mtspr 48, 2 # encoding: [0x7c,0x50,0x0b,0xa6]
+# CHECK-LE: mtspr 48, 2 # encoding: [0xa6,0x0b,0x50,0x7c]
mtpid 2
-# CHECK-BE: mfpid 2 # encoding: [0x7c,0x50,0x0a,0xa6]
-# CHECK-LE: mfpid 2 # encoding: [0xa6,0x0a,0x50,0x7c]
+# CHECK-BE: mtspr 48, 2 # encoding: [0x7c,0x50,0x0b,0xa6]
+# CHECK-LE: mtspr 48, 2 # encoding: [0xa6,0x0b,0x50,0x7c]
+ mtpidr 2
+# CHECK-BE: mfspr 2, 48 # encoding: [0x7c,0x50,0x0a,0xa6]
+# CHECK-LE: mfspr 2, 48 # encoding: [0xa6,0x0a,0x50,0x7c]
mfpid 2
+# CHECK-BE: mfspr 2, 48 # encoding: [0x7c,0x50,0x0a,0xa6]
+# CHECK-LE: mfspr 2, 48 # encoding: [0xa6,0x0a,0x50,0x7c]
+ mfpidr 2
# CHECK-BE: mtlr 2 # encoding: [0x7c,0x48,0x03,0xa6]
# CHECK-LE: mtlr 2 # encoding: [0xa6,0x03,0x48,0x7c]
mtlr 2
diff --git a/llvm/test/Transforms/InstCombine/ctlz-cttz.ll b/llvm/test/Transforms/InstCombine/ctlz-cttz.ll
new file mode 100644
index 0000000..871fb34
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/ctlz-cttz.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -S -passes=instcombine | FileCheck %s
+
+; ctlz(~i & (i - 1)) -> bitwidth - cttz(i, false)
+define i8 @ctlz_to_sub_bw_cttz(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_poison(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_poison(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 true)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_different_add(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_different_add(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[DEC:%.*]] = add i8 [[A0]], 1
+; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A0]], -1
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[DEC]], [[NOT]]
+; CHECK-NEXT: [[CLZ:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctlz.i8(i8 [[AND]], i1 false)
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, 1
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_different_xor(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_different_xor(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[DEC:%.*]] = add i8 [[A0]], -1
+; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A0]], 1
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[DEC]], [[NOT]]
+; CHECK-NEXT: [[CLZ:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctlz.i8(i8 [[AND]], i1 false)
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, 1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+declare void @use(i8)
+
+define i8 @ctlz_to_sub_bw_cttz_multi_use_dec(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_multi_use_dec(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[DEC:%.*]] = add i8 [[A0]], -1
+; CHECK-NEXT: call void @use(i8 [[DEC]])
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ call void @use(i8 %dec)
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_multi_use_not(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_multi_use_not(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A0]], -1
+; CHECK-NEXT: call void @use(i8 [[NOT]])
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ call void @use(i8 %not)
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_multi_use_and(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_multi_use_and(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[DEC:%.*]] = add i8 [[A0]], -1
+; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A0]], -1
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[DEC]], [[NOT]]
+; CHECK-NEXT: call void @use(i8 [[AND]])
+; CHECK-NEXT: [[CLZ:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctlz.i8(i8 [[AND]], i1 false)
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ call void @use(i8 %and)
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_commute_and(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_commute_and(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ %and = and i8 %not, %dec
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define <2 x i8> @ctlz_to_sub_bw_cttz_vec_splat(<2 x i8> %a0) {
+; CHECK-LABEL: define <2 x i8> @ctlz_to_sub_bw_cttz_vec_splat(
+; CHECK-SAME: <2 x i8> [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) <2 x i8> @llvm.cttz.v2i8(<2 x i8> [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw <2 x i8> splat (i8 8), [[TMP1]]
+; CHECK-NEXT: ret <2 x i8> [[CLZ]]
+;
+ %dec = add <2 x i8> %a0, <i8 -1, i8 -1>
+ %not = xor <2 x i8> %a0, <i8 -1, i8 -1>
+ %and = and <2 x i8> %dec, %not
+ %clz = tail call <2 x i8>@llvm.ctlz.v2i8(<2 x i8> %and, i1 false)
+ ret <2 x i8> %clz
+}
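
The fold these tests pin down is ctlz(~i & (i - 1)) == bitwidth - cttz(i): the mask ~i & (i - 1) has exactly cttz(i) low bits set, so its leading-zero count is the bit width minus cttz(i). An exhaustive i8 check (not from the patch, assuming GCC/Clang-style __builtin_clz/__builtin_ctz):

// Brute-force verification of ctlz(~i & (i - 1)) == 8 - cttz(i) over all
// i8 values; both sides use the is_zero_poison = false convention.
#include <cassert>
#include <cstdint>
#include <cstdio>

static int clz8(uint8_t x) { return x ? __builtin_clz(x) - 24 : 8; }
static int ctz8(uint8_t x) { return x ? __builtin_ctz(x) : 8; }

int main() {
  for (int i = 0; i < 256; ++i) {
    uint8_t v = uint8_t(i);
    uint8_t mask = uint8_t(~v & (v - 1)); // low ctz8(v) bits set
    assert(clz8(mask) == 8 - ctz8(v));
  }
  std::puts("ctlz(~i & (i - 1)) == 8 - cttz(i) holds for every i8 value");
  return 0;
}
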
diff --git a/llvm/test/Transforms/InstCombine/scmp.ll b/llvm/test/Transforms/InstCombine/scmp.ll
index c0be5b9..2ae062cd 100644
--- a/llvm/test/Transforms/InstCombine/scmp.ll
+++ b/llvm/test/Transforms/InstCombine/scmp.ll
@@ -519,9 +519,7 @@ define <3 x i2> @scmp_unary_shuffle_ops(<3 x i8> %x, <3 x i8> %y) {
define i32 @scmp_sgt_slt(i32 %a) {
; CHECK-LABEL: define i32 @scmp_sgt_slt(
; CHECK-SAME: i32 [[A:%.*]]) {
-; CHECK-NEXT: [[A_LOBIT:%.*]] = ashr i32 [[A]], 31
-; CHECK-NEXT: [[CMP_INV:%.*]] = icmp slt i32 [[A]], 1
-; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[CMP_INV]], i32 [[A_LOBIT]], i32 1
+; CHECK-NEXT: [[RETVAL_0:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[A]], i32 0)
; CHECK-NEXT: ret i32 [[RETVAL_0]]
;
%cmp = icmp sgt i32 %a, 0
@@ -747,3 +745,55 @@ define i8 @scmp_from_select_eq_and_gt_neg3(i32 %x, i32 %y) {
%r = select i1 %eq, i8 0, i8 %sel1
ret i8 %r
}
+
+define i32 @scmp_ashr(i32 %a) {
+; CHECK-LABEL: define i32 @scmp_ashr(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[RETVAL_0:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[A]], i32 0)
+; CHECK-NEXT: ret i32 [[RETVAL_0]]
+;
+ %a.lobit = ashr i32 %a, 31
+ %cmp.inv = icmp slt i32 %a, 1
+ %retval.0 = select i1 %cmp.inv, i32 %a.lobit, i32 1
+ ret i32 %retval.0
+}
+
+; select (icmp sgt X, 0), 1, ashr X, bitwidth-1 -> scmp(X, 0)
+define i8 @scmp_ashr_sgt_pattern(i8 %a) {
+; CHECK-LABEL: define i8 @scmp_ashr_sgt_pattern(
+; CHECK-SAME: i8 [[A:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i8(i8 [[A]], i8 0)
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %a.lobit = ashr i8 %a, 7
+ %cmp = icmp sgt i8 %a, 0
+ %retval = select i1 %cmp, i8 1, i8 %a.lobit
+ ret i8 %retval
+}
+
+; select (icmp slt X, 1), ashr X, bitwidth-1, 1 -> scmp(X, 0)
+define i8 @scmp_ashr_slt_pattern(i8 %a) {
+; CHECK-LABEL: define i8 @scmp_ashr_slt_pattern(
+; CHECK-SAME: i8 [[A:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i8(i8 [[A]], i8 0)
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %a.lobit = ashr i8 %a, 7
+ %cmp = icmp slt i8 %a, 1
+ %retval = select i1 %cmp, i8 %a.lobit, i8 1
+ ret i8 %retval
+}
+
+define i8 @scmp_ashr_slt_pattern_neg(i8 %a) {
+; CHECK-LABEL: define i8 @scmp_ashr_slt_pattern_neg(
+; CHECK-SAME: i8 [[A:%.*]]) {
+; CHECK-NEXT: [[A_LOBIT:%.*]] = ashr i8 [[A]], 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[A]], 1
+; CHECK-NEXT: [[RETVAL:%.*]] = select i1 [[CMP]], i8 [[A_LOBIT]], i8 1
+; CHECK-NEXT: ret i8 [[RETVAL]]
+;
+ %a.lobit = ashr i8 %a, 4
+ %cmp = icmp slt i8 %a, 1
+ %retval = select i1 %cmp, i8 %a.lobit, i8 1
+ ret i8 %retval
+}
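
Both new scmp_ashr patterns compute sign(X), which is exactly scmp(X, 0): 1 when X > 0, 0 when X == 0, and the ashr result -1 when X < 0. A quick equivalence check (not from the patch; it assumes signed right shift is arithmetic, matching the ashr in the IR):

// select (icmp sgt x, 0), 1, (ashr x, 31) versus a reference three-way
// compare against zero.
#include <cassert>
#include <cstdint>

static int32_t signViaSelect(int32_t x) {
  int32_t lobit = x >> 31;  // ashr x, 31: 0 for non-negative, -1 for negative
  return x > 0 ? 1 : lobit; // select (icmp sgt x, 0), 1, lobit
}

static int32_t scmpWithZero(int32_t x) {
  return (x > 0) - (x < 0); // reference scmp(x, 0)
}

int main() {
  const int32_t probes[] = {INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX};
  for (int32_t x : probes)
    assert(signViaSelect(x) == scmpWithZero(x));
  return 0;
}
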
diff --git a/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll b/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll
index 43fb260..d981626 100644
--- a/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll
+++ b/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll
@@ -1,7 +1,5 @@
; RUN: opt -safe-stack -S -mtriple=aarch64-linux-android < %s -o - | FileCheck --check-prefixes=TLS,ANDROID %s
-; RUN: opt -safe-stack -S -mtriple=aarch64-unknown-fuchsia < %s -o - | FileCheck --check-prefixes=TLS,FUCHSIA %s
; RUN: opt -passes=safe-stack -S -mtriple=aarch64-linux-android < %s -o - | FileCheck --check-prefixes=TLS,ANDROID %s
-; RUN: opt -passes=safe-stack -S -mtriple=aarch64-unknown-fuchsia < %s -o - | FileCheck --check-prefixes=TLS,FUCHSIA %s
define void @foo() nounwind uwtable safestack sspreq {
entry:
@@ -10,7 +8,6 @@ entry:
; TLS: %[[TP2:.*]] = call ptr @llvm.thread.pointer.p0()
; ANDROID: %[[B:.*]] = getelementptr i8, ptr %[[TP2]], i32 40
-; FUCHSIA: %[[B:.*]] = getelementptr i8, ptr %[[TP2]], i32 -16
; TLS: %[[StackGuard:.*]] = load ptr, ptr %[[B]]
; TLS: store ptr %[[StackGuard]], ptr %[[StackGuardSlot:.*]]
%a = alloca i128, align 16
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
index 49af4df..c20409e 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
@@ -6864,7 +6864,7 @@ zip2 z31.s, z31.s, z31.s
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14]
-# CHECK-NEXT: - - - - - - - 245.00 651.00 651.00 570.50 272.50 83.75 83.75 81.75 81.75 1536.75 1281.75 794.25 748.25
+# CHECK-NEXT: - - - - - - - 245.00 651.00 651.00 570.50 272.50 83.75 83.75 81.75 81.75 1540.75 1285.75 790.25 744.25
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] Instructions:
@@ -9617,39 +9617,39 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - - - - 9.00 9.00 - - - - - - 9.00 9.00 - - st4w { z21.s - z24.s }, p5, [x10, #20, mul vl]
# CHECK-NEXT: - - - - - - - - 9.00 9.00 - - - - - - 9.00 9.00 - - st4w { z23.s - z26.s }, p3, [x13, #-32, mul vl]
# CHECK-NEXT: - - - - - - - - 9.00 9.00 - - 4.50 4.50 4.50 4.50 9.00 9.00 - - st4w { z5.s - z8.s }, p3, [x17, x16, lsl #2]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1b { z0.b }, p0, [x0, x0]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1b { z0.b }, p0, [x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1b { z0.b }, p0, [x0, x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1b { z0.b }, p0, [x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1b { z0.d }, p0, [z1.d]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1b { z0.s }, p0, [z1.s]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1b { z21.b }, p5, [x10, #7, mul vl]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1b { z23.b }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1b { z21.b }, p5, [x10, #7, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1b { z23.b }, p3, [x13, #-8, mul vl]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1b { z31.d }, p7, [z31.d, x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1b { z31.d }, p7, [z31.d]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1b { z31.s }, p7, [z31.s, x0]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1b { z31.s }, p7, [z31.s]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1d { z0.d }, p0, [x0, x0, lsl #3]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1d { z0.d }, p0, [x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1d { z0.d }, p0, [x0, x0, lsl #3]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1d { z0.d }, p0, [x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1d { z0.d }, p0, [z1.d]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1d { z21.d }, p5, [x10, #7, mul vl]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1d { z23.d }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1d { z21.d }, p5, [x10, #7, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1d { z23.d }, p3, [x13, #-8, mul vl]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1d { z31.d }, p7, [z31.d, x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1d { z31.d }, p7, [z31.d]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1h { z0.d }, p0, [z1.d]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 stnt1h { z0.h }, p0, [x0, x0, lsl #1]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1h { z0.h }, p0, [x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - 0.25 0.25 0.25 0.25 0.50 0.50 - - stnt1h { z0.h }, p0, [x0, x0, lsl #1]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1h { z0.h }, p0, [x0]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1h { z0.s }, p0, [z1.s]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1h { z21.h }, p5, [x10, #7, mul vl]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1h { z23.h }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1h { z21.h }, p5, [x10, #7, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1h { z23.h }, p3, [x13, #-8, mul vl]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1h { z31.d }, p7, [z31.d, x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1h { z31.d }, p7, [z31.d]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1h { z31.s }, p7, [z31.s, x0]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1h { z31.s }, p7, [z31.s]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1w { z0.d }, p0, [z1.d]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1w { z0.s }, p0, [x0, x0, lsl #2]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1w { z0.s }, p0, [x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1w { z0.s }, p0, [x0, x0, lsl #2]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1w { z0.s }, p0, [x0]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1w { z0.s }, p0, [z1.s]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1w { z21.s }, p5, [x10, #7, mul vl]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1w { z23.s }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1w { z21.s }, p5, [x10, #7, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1w { z23.s }, p3, [x13, #-8, mul vl]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1w { z31.d }, p7, [z31.d, x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1w { z31.d }, p7, [z31.d]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1w { z31.s }, p7, [z31.s, x0]
diff --git a/llvm/test/tools/llvm-profdata/input-wildcard.test b/llvm/test/tools/llvm-profdata/input-wildcard.test
new file mode 100644
index 0000000..f2c46c9
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/input-wildcard.test
@@ -0,0 +1,15 @@
+# This test verifies that llvm-profdata will do wildcard expansion on its
+# arguments. The expansion is done by Windows-specific support in InitLLVM, so
+# we only expect this to work on Windows hosts.
+# REQUIRES: system-windows
+
+# Create two files to glob.
+RUN: echo '# empty profile 1' > %t.prof1.proftxt
+RUN: echo '# empty profile 2' >> %t.prof2.proftxt
+
+# Prevent LIT itself from globbing by quoting the wildcard argument.
+RUN: llvm-profdata merge "%t.*.proftxt" -dump-input-file-list -o /dev/null | FileCheck %s
+
+# Verify that llvm-profdata expanded the wildcard argument.
+CHECK: 1,{{.*}}.prof1.proftxt
+CHECK-NEXT: 1,{{.*}}.prof2.proftxt
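
The expansion the new test exercises comes from llvm::InitLLVM, which on Windows re-reads the command line in UTF-8 and expands wildcard arguments before the tool sees argv. A sketch of how a tool picks that up (llvm-profdata is assumed to initialize the same way; the sketch must be built against LLVMSupport):

// Minimal tool skeleton: InitLLVM performs the Windows wildcard expansion,
// so "%t.*.proftxt" arrives as the individual matching file names.
#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/raw_ostream.h"

int main(int argc, char **argv) {
  llvm::InitLLVM X(argc, argv); // expands wildcards on Windows hosts
  for (int i = 1; i < argc; ++i)
    llvm::outs() << argv[i] << "\n";
  return 0;
}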