aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-mops.ll188
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll107
-rw-r--r--llvm/test/CodeGen/AArch64/bsp_implicit_ops.mir98
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir30
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir30
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir30
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir29
-rw-r--r--llvm/test/CodeGen/AMDGPU/div_i128.ll64
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/rem_i128.ll64
-rw-r--r--llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll372
-rw-r--r--llvm/test/CodeGen/AMDGPU/srem.ll4
-rw-r--r--llvm/test/CodeGen/LoongArch/lasx/build-vector.ll2
-rw-r--r--llvm/test/CodeGen/LoongArch/lasx/fpowi.ll36
-rw-r--r--llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll11
-rw-r--r--llvm/test/CodeGen/LoongArch/llvm.exp10.ll8
-rw-r--r--llvm/test/CodeGen/LoongArch/llvm.sincos.ll26
-rw-r--r--llvm/test/CodeGen/LoongArch/lsx/build-vector.ll2
-rw-r--r--llvm/test/CodeGen/LoongArch/lsx/fpowi.ll35
-rw-r--r--llvm/test/CodeGen/NVPTX/i1-select.ll30
-rw-r--r--llvm/test/CodeGen/NVPTX/i128.ll582
-rw-r--r--llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll417
-rw-r--r--llvm/test/CodeGen/RISCV/fpclamptosat.ll88
-rw-r--r--llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll36
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir50
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll35
-rw-r--r--llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll1901
-rw-r--r--llvm/test/CodeGen/RISCV/xtheadfmemidx.ll128
-rw-r--r--llvm/test/CodeGen/RISCV/xtheadmemidx.ll759
-rw-r--r--llvm/test/CodeGen/SPARC/tls-sp.ll105
-rw-r--r--llvm/test/CodeGen/SystemZ/pr60413.ll36
-rw-r--r--llvm/test/CodeGen/WebAssembly/memory-interleave.ll1413
-rw-r--r--llvm/test/CodeGen/WinEH/wineh-noret-cleanup.ll14
-rw-r--r--llvm/test/CodeGen/X86/abds-neg.ll92
-rw-r--r--llvm/test/CodeGen/X86/avg.ll156
-rw-r--r--llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll2
-rw-r--r--llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll1
-rw-r--r--llvm/test/CodeGen/X86/conditional-tailcall.ll1
-rw-r--r--llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll39
-rw-r--r--llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll12
-rw-r--r--llvm/test/CodeGen/X86/freeze-vector.ll24
-rw-r--r--llvm/test/CodeGen/X86/noreturn-call-win64.ll12
-rw-r--r--llvm/test/CodeGen/X86/pr149841.ll34
-rw-r--r--llvm/test/CodeGen/X86/seh-catch-all.ll2
-rw-r--r--llvm/test/CodeGen/X86/seh-catchpad.ll10
-rw-r--r--llvm/test/CodeGen/X86/seh-except-finally.ll6
-rw-r--r--llvm/test/CodeGen/X86/seh-finally.ll2
-rw-r--r--llvm/test/CodeGen/X86/seh-safe-div.ll5
-rw-r--r--llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll4
-rw-r--r--llvm/test/CodeGen/X86/setcc-non-simple-type.ll4
-rw-r--r--llvm/test/CodeGen/X86/stack-coloring-wineh.ll2
-rw-r--r--llvm/test/CodeGen/X86/taildup-heapallocsite.ll2
-rw-r--r--llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll8
-rw-r--r--llvm/test/CodeGen/X86/win-catchpad.ll8
-rw-r--r--llvm/test/CodeGen/X86/win-cleanuppad.ll2
-rw-r--r--llvm/test/CodeGen/X86/win32-eh-states.ll14
-rw-r--r--llvm/test/CodeGen/X86/win64-seh-epilogue-statepoint.ll2
-rw-r--r--llvm/test/CodeGen/X86/wineh-coreclr.ll14
-rw-r--r--llvm/test/CodeGen/XCore/exception.ll2
59 files changed, 4665 insertions, 2528 deletions
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mops.ll b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
index ff7872c..83530049a 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mops.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
@@ -87,46 +87,17 @@ entry:
}
define void @memset_10_zeroval_volatile(ptr %dst) {
-; GISel-WITHOUT-MOPS-O0-LABEL: memset_10_zeroval_volatile:
-; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O0-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w2, w8
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w1, wzr
-; GISel-WITHOUT-MOPS-O0-NEXT: bl memset
-; GISel-WITHOUT-MOPS-O0-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; GISel-WITHOUT-MOPS-O0-NEXT: ret
-;
-; GISel-WITHOUT-MOPS-O3-LABEL: memset_10_zeroval_volatile:
-; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O3-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT: mov w1, wzr
-; GISel-WITHOUT-MOPS-O3-NEXT: mov w2, #10 // =0xa
-; GISel-WITHOUT-MOPS-O3-NEXT: bl memset
-; GISel-WITHOUT-MOPS-O3-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; GISel-WITHOUT-MOPS-O3-NEXT: ret
-;
-; GISel-MOPS-O0-LABEL: memset_10_zeroval_volatile:
-; GISel-MOPS-O0: // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O0-NEXT: // kill: def $x8 killed $w8
-; GISel-MOPS-O0-NEXT: mov x9, xzr
-; GISel-MOPS-O0-NEXT: setp [x0]!, x8!, x9
-; GISel-MOPS-O0-NEXT: setm [x0]!, x8!, x9
-; GISel-MOPS-O0-NEXT: sete [x0]!, x8!, x9
-; GISel-MOPS-O0-NEXT: ret
+; GISel-WITHOUT-MOPS-LABEL: memset_10_zeroval_volatile:
+; GISel-WITHOUT-MOPS: // %bb.0: // %entry
+; GISel-WITHOUT-MOPS-NEXT: str xzr, [x0]
+; GISel-WITHOUT-MOPS-NEXT: strh wzr, [x0, #8]
+; GISel-WITHOUT-MOPS-NEXT: ret
;
-; GISel-MOPS-O3-LABEL: memset_10_zeroval_volatile:
-; GISel-MOPS-O3: // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O3-NEXT: setp [x0]!, x8!, xzr
-; GISel-MOPS-O3-NEXT: setm [x0]!, x8!, xzr
-; GISel-MOPS-O3-NEXT: sete [x0]!, x8!, xzr
-; GISel-MOPS-O3-NEXT: ret
+; GISel-MOPS-LABEL: memset_10_zeroval_volatile:
+; GISel-MOPS: // %bb.0: // %entry
+; GISel-MOPS-NEXT: str xzr, [x0]
+; GISel-MOPS-NEXT: strh wzr, [x0, #8]
+; GISel-MOPS-NEXT: ret
;
; SDAG-WITHOUT-MOPS-O2-LABEL: memset_10_zeroval_volatile:
; SDAG-WITHOUT-MOPS-O2: // %bb.0: // %entry
@@ -490,43 +461,46 @@ entry:
define void @memset_10_volatile(ptr %dst, i32 %value) {
; GISel-WITHOUT-MOPS-O0-LABEL: memset_10_volatile:
; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O0-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w2, w8
-; GISel-WITHOUT-MOPS-O0-NEXT: bl memset
-; GISel-WITHOUT-MOPS-O0-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; GISel-WITHOUT-MOPS-O0-NEXT: // implicit-def: $x8
+; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, w1
+; GISel-WITHOUT-MOPS-O0-NEXT: and x8, x8, #0xff
+; GISel-WITHOUT-MOPS-O0-NEXT: mov x9, #72340172838076673 // =0x101010101010101
+; GISel-WITHOUT-MOPS-O0-NEXT: mul x8, x8, x9
+; GISel-WITHOUT-MOPS-O0-NEXT: str x8, [x0]
+; GISel-WITHOUT-MOPS-O0-NEXT: // kill: def $w8 killed $w8 killed $x8
+; GISel-WITHOUT-MOPS-O0-NEXT: strh w8, [x0, #8]
; GISel-WITHOUT-MOPS-O0-NEXT: ret
;
; GISel-WITHOUT-MOPS-O3-LABEL: memset_10_volatile:
; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O3-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT: mov w2, #10 // =0xa
-; GISel-WITHOUT-MOPS-O3-NEXT: bl memset
-; GISel-WITHOUT-MOPS-O3-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; GISel-WITHOUT-MOPS-O3-NEXT: // kill: def $w1 killed $w1 def $x1
+; GISel-WITHOUT-MOPS-O3-NEXT: mov x8, #72340172838076673 // =0x101010101010101
+; GISel-WITHOUT-MOPS-O3-NEXT: and x9, x1, #0xff
+; GISel-WITHOUT-MOPS-O3-NEXT: mul x8, x9, x8
+; GISel-WITHOUT-MOPS-O3-NEXT: str x8, [x0]
+; GISel-WITHOUT-MOPS-O3-NEXT: strh w8, [x0, #8]
; GISel-WITHOUT-MOPS-O3-NEXT: ret
;
; GISel-MOPS-O0-LABEL: memset_10_volatile:
; GISel-MOPS-O0: // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O0-NEXT: // kill: def $x8 killed $w8
-; GISel-MOPS-O0-NEXT: // implicit-def: $x9
-; GISel-MOPS-O0-NEXT: mov w9, w1
-; GISel-MOPS-O0-NEXT: setp [x0]!, x8!, x9
-; GISel-MOPS-O0-NEXT: setm [x0]!, x8!, x9
-; GISel-MOPS-O0-NEXT: sete [x0]!, x8!, x9
+; GISel-MOPS-O0-NEXT: // implicit-def: $x8
+; GISel-MOPS-O0-NEXT: mov w8, w1
+; GISel-MOPS-O0-NEXT: and x8, x8, #0xff
+; GISel-MOPS-O0-NEXT: mov x9, #72340172838076673 // =0x101010101010101
+; GISel-MOPS-O0-NEXT: mul x8, x8, x9
+; GISel-MOPS-O0-NEXT: str x8, [x0]
+; GISel-MOPS-O0-NEXT: // kill: def $w8 killed $w8 killed $x8
+; GISel-MOPS-O0-NEXT: strh w8, [x0, #8]
; GISel-MOPS-O0-NEXT: ret
;
; GISel-MOPS-O3-LABEL: memset_10_volatile:
; GISel-MOPS-O3: // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT: mov w8, #10 // =0xa
; GISel-MOPS-O3-NEXT: // kill: def $w1 killed $w1 def $x1
-; GISel-MOPS-O3-NEXT: setp [x0]!, x8!, x1
-; GISel-MOPS-O3-NEXT: setm [x0]!, x8!, x1
-; GISel-MOPS-O3-NEXT: sete [x0]!, x8!, x1
+; GISel-MOPS-O3-NEXT: mov x8, #72340172838076673 // =0x101010101010101
+; GISel-MOPS-O3-NEXT: and x9, x1, #0xff
+; GISel-MOPS-O3-NEXT: mul x8, x9, x8
+; GISel-MOPS-O3-NEXT: str x8, [x0]
+; GISel-MOPS-O3-NEXT: strh w8, [x0, #8]
; GISel-MOPS-O3-NEXT: ret
;
; SDAG-WITHOUT-MOPS-O2-LABEL: memset_10_volatile:
@@ -905,43 +879,21 @@ entry:
}
define void @memcpy_10_volatile(ptr %dst, ptr %src, i32 %value) {
-; GISel-WITHOUT-MOPS-O0-LABEL: memcpy_10_volatile:
-; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O0-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w2, w8
-; GISel-WITHOUT-MOPS-O0-NEXT: bl memcpy
-; GISel-WITHOUT-MOPS-O0-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; GISel-WITHOUT-MOPS-O0-NEXT: ret
-;
-; GISel-WITHOUT-MOPS-O3-LABEL: memcpy_10_volatile:
-; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O3-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT: mov w2, #10 // =0xa
-; GISel-WITHOUT-MOPS-O3-NEXT: bl memcpy
-; GISel-WITHOUT-MOPS-O3-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; GISel-WITHOUT-MOPS-O3-NEXT: ret
-;
-; GISel-MOPS-O0-LABEL: memcpy_10_volatile:
-; GISel-MOPS-O0: // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O0-NEXT: // kill: def $x8 killed $w8
-; GISel-MOPS-O0-NEXT: cpyfp [x0]!, [x1]!, x8!
-; GISel-MOPS-O0-NEXT: cpyfm [x0]!, [x1]!, x8!
-; GISel-MOPS-O0-NEXT: cpyfe [x0]!, [x1]!, x8!
-; GISel-MOPS-O0-NEXT: ret
+; GISel-WITHOUT-MOPS-LABEL: memcpy_10_volatile:
+; GISel-WITHOUT-MOPS: // %bb.0: // %entry
+; GISel-WITHOUT-MOPS-NEXT: ldr x8, [x1]
+; GISel-WITHOUT-MOPS-NEXT: str x8, [x0]
+; GISel-WITHOUT-MOPS-NEXT: ldrh w8, [x1, #8]
+; GISel-WITHOUT-MOPS-NEXT: strh w8, [x0, #8]
+; GISel-WITHOUT-MOPS-NEXT: ret
;
-; GISel-MOPS-O3-LABEL: memcpy_10_volatile:
-; GISel-MOPS-O3: // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O3-NEXT: cpyfp [x0]!, [x1]!, x8!
-; GISel-MOPS-O3-NEXT: cpyfm [x0]!, [x1]!, x8!
-; GISel-MOPS-O3-NEXT: cpyfe [x0]!, [x1]!, x8!
-; GISel-MOPS-O3-NEXT: ret
+; GISel-MOPS-LABEL: memcpy_10_volatile:
+; GISel-MOPS: // %bb.0: // %entry
+; GISel-MOPS-NEXT: ldr x8, [x1]
+; GISel-MOPS-NEXT: str x8, [x0]
+; GISel-MOPS-NEXT: ldrh w8, [x1, #8]
+; GISel-MOPS-NEXT: strh w8, [x0, #8]
+; GISel-MOPS-NEXT: ret
;
; SDAG-WITHOUT-MOPS-O2-LABEL: memcpy_10_volatile:
; SDAG-WITHOUT-MOPS-O2: // %bb.0: // %entry
@@ -1736,40 +1688,34 @@ entry:
define void @memmove_10_volatile(ptr %dst, ptr %src, i32 %value) {
; GISel-WITHOUT-MOPS-O0-LABEL: memmove_10_volatile:
; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O0-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w2, w8
-; GISel-WITHOUT-MOPS-O0-NEXT: bl memmove
-; GISel-WITHOUT-MOPS-O0-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; GISel-WITHOUT-MOPS-O0-NEXT: ldr x9, [x1]
+; GISel-WITHOUT-MOPS-O0-NEXT: ldrh w8, [x1, #8]
+; GISel-WITHOUT-MOPS-O0-NEXT: str x9, [x0]
+; GISel-WITHOUT-MOPS-O0-NEXT: strh w8, [x0, #8]
; GISel-WITHOUT-MOPS-O0-NEXT: ret
;
; GISel-WITHOUT-MOPS-O3-LABEL: memmove_10_volatile:
; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O3-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT: mov w2, #10 // =0xa
-; GISel-WITHOUT-MOPS-O3-NEXT: bl memmove
-; GISel-WITHOUT-MOPS-O3-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; GISel-WITHOUT-MOPS-O3-NEXT: ldr x8, [x1]
+; GISel-WITHOUT-MOPS-O3-NEXT: ldrh w9, [x1, #8]
+; GISel-WITHOUT-MOPS-O3-NEXT: str x8, [x0]
+; GISel-WITHOUT-MOPS-O3-NEXT: strh w9, [x0, #8]
; GISel-WITHOUT-MOPS-O3-NEXT: ret
;
; GISel-MOPS-O0-LABEL: memmove_10_volatile:
; GISel-MOPS-O0: // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O0-NEXT: // kill: def $x8 killed $w8
-; GISel-MOPS-O0-NEXT: cpyp [x0]!, [x1]!, x8!
-; GISel-MOPS-O0-NEXT: cpym [x0]!, [x1]!, x8!
-; GISel-MOPS-O0-NEXT: cpye [x0]!, [x1]!, x8!
+; GISel-MOPS-O0-NEXT: ldr x9, [x1]
+; GISel-MOPS-O0-NEXT: ldrh w8, [x1, #8]
+; GISel-MOPS-O0-NEXT: str x9, [x0]
+; GISel-MOPS-O0-NEXT: strh w8, [x0, #8]
; GISel-MOPS-O0-NEXT: ret
;
; GISel-MOPS-O3-LABEL: memmove_10_volatile:
; GISel-MOPS-O3: // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O3-NEXT: cpyp [x0]!, [x1]!, x8!
-; GISel-MOPS-O3-NEXT: cpym [x0]!, [x1]!, x8!
-; GISel-MOPS-O3-NEXT: cpye [x0]!, [x1]!, x8!
+; GISel-MOPS-O3-NEXT: ldr x8, [x1]
+; GISel-MOPS-O3-NEXT: ldrh w9, [x1, #8]
+; GISel-MOPS-O3-NEXT: str x8, [x0]
+; GISel-MOPS-O3-NEXT: strh w9, [x0, #8]
; GISel-MOPS-O3-NEXT: ret
;
; SDAG-WITHOUT-MOPS-O2-LABEL: memmove_10_volatile:
diff --git a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
index e31c9a0..113eb14 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
@@ -263,3 +263,110 @@ entry:
%conv = zext i1 %cmp to i8
ret i8 %conv
}
+
+; Test ANDS.
+define i32 @test1_ands(i32 %a) {
+; CHECK-LABEL: test1_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and w8, w0, #0x3ffc00
+; CHECK-NEXT: ands w8, w8, #0xffe007ff
+; CHECK-NEXT: csel w0, w0, w8, eq
+; CHECK-NEXT: ret
+entry:
+ %ands = and i32 %a, 2098176
+ %c = icmp eq i32 %ands, 0
+ %r = select i1 %c, i32 %a, i32 %ands
+ ret i32 %r
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i32 @test2_ands(i32 %a) {
+; CHECK-LABEL: test2_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #135 // =0x87
+; CHECK-NEXT: ands w8, w0, w8
+; CHECK-NEXT: csel w0, w0, w8, eq
+; CHECK-NEXT: ret
+entry:
+ %ands = and i32 %a, 135
+ %c = icmp eq i32 %ands, 0
+ %r = select i1 %c, i32 %a, i32 %ands
+ ret i32 %r
+}
+
+; This constant should not be split because the split immediate is not valid
+; bitmask immediate.
+define i32 @test3_ands(i32 %a) {
+; CHECK-LABEL: test3_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: movk w8, #33, lsl #16
+; CHECK-NEXT: ands w8, w0, w8
+; CHECK-NEXT: csel w0, w0, w8, eq
+; CHECK-NEXT: ret
+entry:
+ %ands = and i32 %a, 2163712
+ %c = icmp eq i32 %ands, 0
+ %r = select i1 %c, i32 %a, i32 %ands
+ ret i32 %r
+}
+
+define i64 @test4_ands(i64 %a) {
+; CHECK-LABEL: test4_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and x8, x0, #0x3ffc00
+; CHECK-NEXT: ands x8, x8, #0xffffffffffe007ff
+; CHECK-NEXT: csel x0, x0, x8, eq
+; CHECK-NEXT: ret
+entry:
+ %ands = and i64 %a, 2098176
+ %c = icmp eq i64 %ands, 0
+ %r = select i1 %c, i64 %a, i64 %ands
+ ret i64 %r
+}
+
+define i64 @test5_ands(i64 %a) {
+; CHECK-LABEL: test5_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and x8, x0, #0x3ffffc000
+; CHECK-NEXT: ands x8, x8, #0xfffffffe00007fff
+; CHECK-NEXT: csel x0, x0, x8, eq
+; CHECK-NEXT: ret
+entry:
+ %ands = and i64 %a, 8589950976
+ %c = icmp eq i64 %ands, 0
+ %r = select i1 %c, i64 %a, i64 %ands
+ ret i64 %r
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i64 @test6_ands(i64 %a) {
+; CHECK-LABEL: test6_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #135 // =0x87
+; CHECK-NEXT: ands x8, x0, x8
+; CHECK-NEXT: csel x0, x0, x8, eq
+; CHECK-NEXT: ret
+entry:
+ %ands = and i64 %a, 135
+ %c = icmp eq i64 %ands, 0
+ %r = select i1 %c, i64 %a, i64 %ands
+ ret i64 %r
+}
+
+; This constant should not be split because the split immediate is not valid
+; bitmask immediate.
+define i64 @test7_ands(i64 %a) {
+; CHECK-LABEL: test7_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: movk w8, #33, lsl #16
+; CHECK-NEXT: ands x8, x0, x8
+; CHECK-NEXT: csel x0, x0, x8, eq
+; CHECK-NEXT: ret
+entry:
+ %ands = and i64 %a, 2163712
+ %c = icmp eq i64 %ands, 0
+ %r = select i1 %c, i64 %a, i64 %ands
+ ret i64 %r
+}
diff --git a/llvm/test/CodeGen/AArch64/bsp_implicit_ops.mir b/llvm/test/CodeGen/AArch64/bsp_implicit_ops.mir
new file mode 100644
index 0000000..23ac67c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bsp_implicit_ops.mir
@@ -0,0 +1,98 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass aarch64-expand-pseudo -verify-machineinstrs %s -o - | FileCheck %s
+
+
+---
+name: BSL_COPY
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+
+
+ ; CHECK-LABEL: name: BSL_COPY
+ ; CHECK: liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $q2 = ORRv16i8 killed renamable $q20, killed renamable $q20
+ ; CHECK-NEXT: renamable $q2 = BSLv16i8 killed renamable $q2, renamable $q21, renamable $q6, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ ; CHECK-NEXT: $q22 = ORRv16i8 $q0, killed $q0
+ ; CHECK-NEXT: $q23 = ORRv16i8 $q1, killed $q1
+ ; CHECK-NEXT: $q24 = ORRv16i8 $q2, killed $q2
+ ; CHECK-NEXT: $q25 = ORRv16i8 $q3, killed $q3
+ ; CHECK-NEXT: RET undef $lr, implicit $q22
+ renamable $q2 = BSPv16i8 killed renamable $q20, renamable $q21, renamable $q6, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ $q22 = ORRv16i8 $q0, killed $q0
+ $q23 = ORRv16i8 $q1, killed $q1
+ $q24 = ORRv16i8 $q2, killed $q2
+ $q25 = ORRv16i8 $q3, killed $q3
+ RET_ReallyLR implicit $q22
+...
+---
+name: BSL
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+
+ ; CHECK-LABEL: name: BSL
+ ; CHECK: liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $q2 = BSLv16i8 killed renamable $q2, renamable $q21, renamable $q6, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ ; CHECK-NEXT: $q22 = ORRv16i8 $q0, killed $q0
+ ; CHECK-NEXT: $q23 = ORRv16i8 $q1, killed $q1
+ ; CHECK-NEXT: $q24 = ORRv16i8 $q2, killed $q2
+ ; CHECK-NEXT: $q25 = ORRv16i8 $q3, killed $q3
+ ; CHECK-NEXT: RET undef $lr, implicit $q22
+ renamable $q2 = BSPv16i8 killed renamable $q2, renamable $q21, renamable $q6, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ $q22 = ORRv16i8 $q0, killed $q0
+ $q23 = ORRv16i8 $q1, killed $q1
+ $q24 = ORRv16i8 $q2, killed $q2
+ $q25 = ORRv16i8 $q3, killed $q3
+ RET_ReallyLR implicit $q22
+...
+---
+name: BIF
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+
+ ; CHECK-LABEL: name: BIF
+ ; CHECK: liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $q2 = BIFv16i8 renamable $q2, renamable $q6, killed renamable $q20, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ ; CHECK-NEXT: $q22 = ORRv16i8 $q0, killed $q0
+ ; CHECK-NEXT: $q23 = ORRv16i8 $q1, killed $q1
+ ; CHECK-NEXT: $q24 = ORRv16i8 $q2, killed $q2
+ ; CHECK-NEXT: $q25 = ORRv16i8 $q3, killed $q3
+ ; CHECK-NEXT: RET undef $lr, implicit $q22
+ renamable $q2 = BSPv16i8 killed renamable $q20, renamable $q2, renamable $q6, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ $q22 = ORRv16i8 $q0, killed $q0
+ $q23 = ORRv16i8 $q1, killed $q1
+ $q24 = ORRv16i8 $q2, killed $q2
+ $q25 = ORRv16i8 $q3, killed $q3
+ RET_ReallyLR implicit $q22
+...
+---
+name: BIT
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+
+ ; CHECK-LABEL: name: BIT
+ ; CHECK: liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $q2 = BITv16i8 renamable $q2, renamable $q21, killed renamable $q20, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ ; CHECK-NEXT: $q22 = ORRv16i8 $q0, killed $q0
+ ; CHECK-NEXT: $q23 = ORRv16i8 $q1, killed $q1
+ ; CHECK-NEXT: $q24 = ORRv16i8 $q2, killed $q2
+ ; CHECK-NEXT: $q25 = ORRv16i8 $q3, killed $q3
+ ; CHECK-NEXT: RET undef $lr, implicit $q22
+ renamable $q2 = BSPv16i8 killed renamable $q20, renamable $q21, renamable $q2, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ $q22 = ORRv16i8 $q0, killed $q0
+ $q23 = ORRv16i8 $q1, killed $q1
+ $q24 = ORRv16i8 $q2, killed $q2
+ $q25 = ORRv16i8 $q3, killed $q3
+ RET_ReallyLR implicit $q22
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir
index be3fe91..4f5f52b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir
@@ -31,3 +31,33 @@ body: |
S_ENDPGM 0
...
+---
+name: memcpy_test_volatile
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; CHECK-LABEL: name: memcpy_test_volatile
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (volatile load (s8))
+ ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV]](p0) :: (volatile store (s8))
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32)
+ %6:_(s32) = G_CONSTANT i32 1
+ %7:_(s64) = G_ZEXT %6:_(s32)
+ G_MEMCPY %2:_(p0), %5:_(p0), %7:_(s64), 0 :: (volatile store (s8)), (volatile load (s8))
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir
index a82ca30..0392aef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir
@@ -31,3 +31,33 @@ body: |
S_ENDPGM 0
...
+---
+name: memcpyinline_test_volatile
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; CHECK-LABEL: name: memcpyinline_test_volatile
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (volatile load (s8))
+ ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV]](p0) :: (volatile store (s8))
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32)
+ %6:_(s32) = G_CONSTANT i32 1
+ %7:_(s64) = G_ZEXT %6:_(s32)
+ G_MEMCPY_INLINE %2:_(p0), %5:_(p0), %7:_(s64) :: (volatile store (s8)), (volatile load (s8))
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir
index e7cfaab..1f8d1aa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir
@@ -31,3 +31,33 @@ body: |
S_ENDPGM 0
...
+---
+name: memmove_test_volatile
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; CHECK-LABEL: name: memmove_test_volatile
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (volatile load (s8))
+ ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV]](p0) :: (volatile store (s8))
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32)
+ %6:_(s32) = G_CONSTANT i32 1
+ %7:_(s64) = G_ZEXT %6:_(s32)
+ G_MEMMOVE %2:_(p0), %5:_(p0), %7:_(s64), 0 :: (volatile store (s8)), (volatile load (s8))
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir
index 021cebb..dda94e15 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir
@@ -30,3 +30,32 @@ body: |
S_ENDPGM 0
...
+---
+name: memset_test_volatile
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: memset_test_volatile
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s8) = COPY [[TRUNC]](s8)
+ ; CHECK-NEXT: G_STORE [[COPY2]](s32), [[MV]](p0) :: (volatile store (s8))
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s16) = G_TRUNC %3:_(s32)
+ %5:_(s8) = G_TRUNC %4:_(s16)
+ %6:_(s32) = G_CONSTANT i32 1
+ %7:_(s64) = G_ZEXT %6:_(s32)
+ G_MEMSET %2:_(p0), %5:_(s8), %7:_(s64), 0 :: (volatile store (s8))
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 4cb0d2d..e6c38d2 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -475,28 +475,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -507,7 +500,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1046,10 +1038,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -2667,28 +2659,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -2699,7 +2684,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -3238,10 +3222,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 355f77a..af914bd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -76,13 +76,12 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; SI-NEXT: s_movk_i32 s4, 0xfc01
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s3, 0xfffff
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfe_u32 v4, v3, 20, 11
-; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4
+; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4
; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6
; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3
; SI-NEXT: v_not_b32_e32 v5, v5
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 5d0e4bf..8fe68ba 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -513,28 +513,21 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -545,7 +538,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1084,10 +1076,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -1900,28 +1892,21 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -1932,7 +1917,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -2471,10 +2455,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll
new file mode 100644
index 0000000..b5bb68e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll
@@ -0,0 +1,372 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s
+
+define amdgpu_ps float @s_load_b32_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b32_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(4) %arrayidx, align 4
+ ret float %ret
+}
+
+; 'i32 %idx' is a signed index while SMRD soffset is unsigned, thus it is not selected.
+
+define amdgpu_ps float @s_load_b32_idx32(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; SDAG-LABEL: s_load_b32_idx32:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_ashr_i32 s3, s2, 31
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; SDAG-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
+; SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; SDAG-NEXT: s_load_b32 s0, s[0:1], 0x0
+; SDAG-NEXT: s_wait_kmcnt 0x0
+; SDAG-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: s_load_b32_idx32:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
+; GISEL-NEXT: s_add_co_u32 s0, s0, s2
+; GISEL-NEXT: s_add_co_ci_u32 s1, s1, s3
+; GISEL-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: ; return to shader part epilog
+entry:
+ %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i32 %idx
+ %ret = load float, ptr addrspace(4) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @s_load_b32_idxprom_wrong_stride(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; SDAG-LABEL: s_load_b32_idxprom_wrong_stride:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_mov_b32 s3, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; SDAG-NEXT: s_lshl_b64 s[2:3], s[2:3], 3
+; SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; SDAG-NEXT: s_load_b32 s0, s[0:1], 0x0
+; SDAG-NEXT: s_wait_kmcnt 0x0
+; SDAG-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: s_load_b32_idxprom_wrong_stride:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_mov_b32 s3, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_lshl_b64 s[2:3], s[2:3], 3
+; GISEL-NEXT: s_add_co_u32 s0, s0, s2
+; GISEL-NEXT: s_add_co_ci_u32 s1, s1, s3
+; GISEL-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(4) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @s_load_b16_idxprom_ioffset(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b16_idxprom_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxadd
+ %ld = load i16, ptr addrspace(4) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps <2 x float> @s_load_b64_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b64_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <2 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @s_load_b96_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b96_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <3 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @s_load_b128_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b128_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <4 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <4 x float> %ret
+}
+
+define amdgpu_ps <8 x float> @s_load_b256_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b256_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <8 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <8 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <8 x float> %ret
+}
+
+define amdgpu_ps <16 x float> @s_load_b512_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b512_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GCN-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; GCN-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
+; GCN-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <16 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <16 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <16 x float> %ret
+}
+
+define amdgpu_ps float @s_load_b32_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b32_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(4) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @s_load_b32_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b32_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x40 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxadd
+ %ret = load float, ptr addrspace(4) %arrayidx, align 4
+ ret float %ret
+}
+
+; Note: this is a byte load, there is nothing to scale
+
+define amdgpu_ps float @s_load_b8_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b8_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_u8 s0, s[0:1], s2 offset:0x10
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i8, ptr addrspace(4) %p, i64 %idxadd
+ %ld = load i8, ptr addrspace(4) %arrayidx
+ %ret.i32 = zext i8 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps float @s_load_b16_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b16_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxprom
+ %ld = load i16, ptr addrspace(4) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps float @s_load_b16_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b16_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxadd
+ %ld = load i16, ptr addrspace(4) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps <2 x float> @s_load_b64_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b64_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <2 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @s_load_b96_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b96_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <3 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @s_load_b128_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b128_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <4 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <4 x float> %ret
+}
+
+define amdgpu_ps <8 x float> @s_load_b256_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b256_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <8 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <8 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <8 x float> %ret
+}
+
+define amdgpu_ps <16 x float> @s_load_b512_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b512_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GCN-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; GCN-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
+; GCN-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <16 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <16 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <16 x float> %ret
+}
+
+!0 = !{i32 0, i32 1024}
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index a6b8ea3..6da7d1b 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -1819,7 +1819,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TAHITI-NEXT: v_mul_hi_u32 v1, v0, v1
; TAHITI-NEXT: v_mul_lo_u32 v1, v1, v2
; TAHITI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2
+; TAHITI-NEXT: v_subrev_i32_e32 v1, vcc, v2, v0
; TAHITI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; TAHITI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2
@@ -6232,7 +6232,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_mul_hi_u32 v8, v14, v8
; TONGA-NEXT: v_mul_lo_u32 v8, v8, v10
; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v14, v8
-; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10
+; TONGA-NEXT: v_subrev_u32_e32 v9, vcc, v10, v8
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10
; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10
diff --git a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll
index a4f3fe7..61a915a 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll
@@ -475,7 +475,6 @@ define void @buildvector_v8f32(ptr %dst, float %a0, float %a1, float %a2, float
; CHECK-NEXT: # kill: def $f2 killed $f2 def $xr2
; CHECK-NEXT: # kill: def $f1 killed $f1 def $xr1
; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
-; CHECK-NEXT: xvinsve0.w $xr0, $xr0, 0
; CHECK-NEXT: xvinsve0.w $xr0, $xr1, 1
; CHECK-NEXT: xvinsve0.w $xr0, $xr2, 2
; CHECK-NEXT: xvinsve0.w $xr0, $xr3, 3
@@ -505,7 +504,6 @@ define void @buildvector_v4f64(ptr %dst, double %a0, double %a1, double %a2, dou
; CHECK-NEXT: # kill: def $f2_64 killed $f2_64 def $xr2
; CHECK-NEXT: # kill: def $f1_64 killed $f1_64 def $xr1
; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
-; CHECK-NEXT: xvinsve0.d $xr0, $xr0, 0
; CHECK-NEXT: xvinsve0.d $xr0, $xr1, 1
; CHECK-NEXT: xvinsve0.d $xr0, $xr2, 2
; CHECK-NEXT: xvinsve0.d $xr0, $xr3, 3
diff --git a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll
index 6ea3efe..3800712 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll
@@ -11,24 +11,23 @@ define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind {
; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill
; CHECK-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
; CHECK-NEXT: addi.w $fp, $a0, 0
-; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 0
+; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 1
; CHECK-NEXT: movgr2fr.w $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
-; CHECK-NEXT: xvinsve0.w $xr0, $xr0, 0
; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
-; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 1
+; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 0
; CHECK-NEXT: movgr2fr.w $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload
-; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 1
-; CHECK-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT: xvinsve0.w $xr0, $xr1, 1
+; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 2
; CHECK-NEXT: movgr2fr.w $fa0, $a0
@@ -106,44 +105,43 @@ define <4 x double> @powi_v4f64(<4 x double> %va, i32 %b) nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -96
; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill
; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill
-; CHECK-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
; CHECK-NEXT: addi.w $fp, $a0, 0
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
; CHECK-NEXT: movgr2fr.d $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2)
; CHECK-NEXT: jirl $ra, $ra, 0
; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
-; CHECK-NEXT: xvinsve0.d $xr0, $xr0, 0
-; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
-; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
+; CHECK-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
; CHECK-NEXT: movgr2fr.d $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2)
; CHECK-NEXT: jirl $ra, $ra, 0
; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
-; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload
-; CHECK-NEXT: xvinsve0.d $xr1, $xr0, 1
-; CHECK-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill
-; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; CHECK-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsve0.d $xr0, $xr1, 1
+; CHECK-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
; CHECK-NEXT: movgr2fr.d $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2)
; CHECK-NEXT: jirl $ra, $ra, 0
; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
-; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload
+; CHECK-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
; CHECK-NEXT: xvinsve0.d $xr1, $xr0, 2
-; CHECK-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill
-; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; CHECK-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
; CHECK-NEXT: movgr2fr.d $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2)
; CHECK-NEXT: jirl $ra, $ra, 0
; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
-; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload
+; CHECK-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
; CHECK-NEXT: xvinsve0.d $xr1, $xr0, 3
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
index f154dd3..221aba3 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
@@ -6,15 +6,12 @@
define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: shufflevector_v4f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
-; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 0
; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 2
-; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 1
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
-; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 2
+; CHECK-NEXT: xvpickve2gr.d $a1, $xr0, 3
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 1
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 2
; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 3
-; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 3
-; CHECK-NEXT: xvori.b $xr0, $xr2, 0
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 3
; CHECK-NEXT: ret
entry:
%c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 6, i32 3, i32 7>
diff --git a/llvm/test/CodeGen/LoongArch/llvm.exp10.ll b/llvm/test/CodeGen/LoongArch/llvm.exp10.ll
index c667a36..62ea5cb 100644
--- a/llvm/test/CodeGen/LoongArch/llvm.exp10.ll
+++ b/llvm/test/CodeGen/LoongArch/llvm.exp10.ll
@@ -196,22 +196,20 @@ define <2 x double> @exp10_v2f64(<2 x double> %x) #0 {
; LA64-NEXT: addi.d $sp, $sp, -48
; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
-; LA64-NEXT: vreplvei.d $vr0, $vr0, 0
+; LA64-NEXT: vreplvei.d $vr0, $vr0, 1
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(exp10)
; LA64-NEXT: jirl $ra, $ra, 0
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
-; LA64-NEXT: vextrins.d $vr0, $vr0, 0
; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
-; LA64-NEXT: vreplvei.d $vr0, $vr0, 1
+; LA64-NEXT: vreplvei.d $vr0, $vr0, 0
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(exp10)
; LA64-NEXT: jirl $ra, $ra, 0
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload
-; LA64-NEXT: vextrins.d $vr1, $vr0, 16
-; LA64-NEXT: vori.b $vr0, $vr1, 0
+; LA64-NEXT: vextrins.d $vr0, $vr1, 16
; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
; LA64-NEXT: addi.d $sp, $sp, 48
; LA64-NEXT: ret
diff --git a/llvm/test/CodeGen/LoongArch/llvm.sincos.ll b/llvm/test/CodeGen/LoongArch/llvm.sincos.ll
index 17e5969..383d63c 100644
--- a/llvm/test/CodeGen/LoongArch/llvm.sincos.ll
+++ b/llvm/test/CodeGen/LoongArch/llvm.sincos.ll
@@ -571,39 +571,37 @@ define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) #0 {
; LA64-NEXT: addi.d $sp, $sp, -80
; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill
; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
-; LA64-NEXT: vreplvei.d $vr0, $vr0, 0
-; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT: vreplvei.d $vr0, $vr0, 1
+; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(sin)
; LA64-NEXT: jirl $ra, $ra, 0
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
-; LA64-NEXT: vextrins.d $vr0, $vr0, 0
-; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill
+; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
-; LA64-NEXT: vreplvei.d $vr0, $vr0, 1
+; LA64-NEXT: vreplvei.d $vr0, $vr0, 0
; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(sin)
; LA64-NEXT: jirl $ra, $ra, 0
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
-; LA64-NEXT: vld $vr1, $sp, 48 # 16-byte Folded Reload
-; LA64-NEXT: vextrins.d $vr1, $vr0, 16
-; LA64-NEXT: vst $vr1, $sp, 48 # 16-byte Folded Spill
-; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT: vextrins.d $vr0, $vr1, 16
+; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(cos)
; LA64-NEXT: jirl $ra, $ra, 0
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
-; LA64-NEXT: vextrins.d $vr0, $vr0, 0
-; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(cos)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
-; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload
-; LA64-NEXT: vextrins.d $vr1, $vr0, 16
+; LA64-NEXT: fmov.d $fa1, $fa0
; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload
+; LA64-NEXT: vextrins.d $vr1, $vr0, 16
+; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload
; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload
; LA64-NEXT: addi.d $sp, $sp, 80
; LA64-NEXT: ret
diff --git a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
index f723343..afc87d1 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
@@ -338,7 +338,6 @@ define void @buildvector_v4f32(ptr %dst, float %a0, float %a1, float %a2, float
; CHECK-NEXT: # kill: def $f2 killed $f2 def $vr2
; CHECK-NEXT: # kill: def $f1 killed $f1 def $vr1
; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
-; CHECK-NEXT: vextrins.w $vr0, $vr0, 0
; CHECK-NEXT: vextrins.w $vr0, $vr1, 16
; CHECK-NEXT: vextrins.w $vr0, $vr2, 32
; CHECK-NEXT: vextrins.w $vr0, $vr3, 48
@@ -358,7 +357,6 @@ define void @buildvector_v2f64(ptr %dst, double %a0, double %a1) nounwind {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $f1_64 killed $f1_64 def $vr1
; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
-; CHECK-NEXT: vextrins.d $vr0, $vr0, 0
; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/LoongArch/lsx/fpowi.ll b/llvm/test/CodeGen/LoongArch/lsx/fpowi.ll
index 79663b6..735dad4 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/fpowi.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/fpowi.ll
@@ -9,44 +9,43 @@ define <4 x float> @powi_v4f32(<4 x float> %va, i32 %b) nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -48
; CHECK-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
; CHECK-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
-; CHECK-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
; CHECK-NEXT: addi.w $fp, $a0, 0
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
+; CHECK-NEXT: vreplvei.w $vr0, $vr0, 1
; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vr0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
-; CHECK-NEXT: vextrins.w $vr0, $vr0, 0
-; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
-; CHECK-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 1
+; CHECK-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; CHECK-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
+; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vr0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
-; CHECK-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload
-; CHECK-NEXT: vextrins.w $vr1, $vr0, 16
-; CHECK-NEXT: vst $vr1, $sp, 16 # 16-byte Folded Spill
-; CHECK-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; CHECK-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; CHECK-NEXT: vextrins.w $vr0, $vr1, 16
+; CHECK-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; CHECK-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
; CHECK-NEXT: vreplvei.w $vr0, $vr0, 2
; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vr0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
-; CHECK-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload
+; CHECK-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
; CHECK-NEXT: vextrins.w $vr1, $vr0, 32
-; CHECK-NEXT: vst $vr1, $sp, 16 # 16-byte Folded Spill
-; CHECK-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; CHECK-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
+; CHECK-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3
; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vr0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
-; CHECK-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload
+; CHECK-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
; CHECK-NEXT: vextrins.w $vr1, $vr0, 48
; CHECK-NEXT: vori.b $vr0, $vr1, 0
; CHECK-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
@@ -68,24 +67,22 @@ define <2 x double> @powi_v2f64(<2 x double> %va, i32 %b) nounwind {
; CHECK-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
; CHECK-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
; CHECK-NEXT: addi.w $fp, $a0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
+; CHECK-NEXT: vreplvei.d $vr0, $vr0, 1
; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2)
; CHECK-NEXT: jirl $ra, $ra, 0
; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
-; CHECK-NEXT: vextrins.d $vr0, $vr0, 0
; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
; CHECK-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 1
+; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2)
; CHECK-NEXT: jirl $ra, $ra, 0
; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
; CHECK-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload
-; CHECK-NEXT: vextrins.d $vr1, $vr0, 16
-; CHECK-NEXT: vori.b $vr0, $vr1, 0
+; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
; CHECK-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
; CHECK-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
; CHECK-NEXT: addi.d $sp, $sp, 48
diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll
index f1adc34..9a051b3 100644
--- a/llvm/test/CodeGen/NVPTX/i1-select.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-select.ll
@@ -94,27 +94,27 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals
define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %false) {
; CHECK-LABEL: test_select_i1_basic_folding(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<12>;
-; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .pred %p<13>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_basic_folding_param_0];
; CHECK-NEXT: setp.eq.b32 %p1, %r1, 0;
-; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_1];
-; CHECK-NEXT: setp.ne.b32 %p2, %r3, 0;
-; CHECK-NEXT: setp.eq.b32 %p3, %r3, 0;
-; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_2];
-; CHECK-NEXT: setp.eq.b32 %p4, %r5, 0;
-; CHECK-NEXT: ld.param.b32 %r6, [test_select_i1_basic_folding_param_3];
+; CHECK-NEXT: ld.param.b32 %r2, [test_select_i1_basic_folding_param_1];
+; CHECK-NEXT: setp.ne.b32 %p2, %r2, 0;
+; CHECK-NEXT: setp.eq.b32 %p3, %r2, 0;
+; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_2];
+; CHECK-NEXT: setp.eq.b32 %p4, %r3, 0;
+; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_folding_param_3];
; CHECK-NEXT: xor.pred %p6, %p1, %p3;
-; CHECK-NEXT: ld.param.b32 %r7, [test_select_i1_basic_folding_param_4];
+; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_4];
; CHECK-NEXT: and.pred %p7, %p6, %p4;
-; CHECK-NEXT: and.pred %p8, %p2, %p4;
-; CHECK-NEXT: and.pred %p9, %p3, %p7;
-; CHECK-NEXT: or.pred %p10, %p9, %p8;
-; CHECK-NEXT: xor.pred %p11, %p10, %p3;
-; CHECK-NEXT: selp.b32 %r8, %r6, %r7, %p11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: and.pred %p9, %p2, %p4;
+; CHECK-NEXT: and.pred %p10, %p3, %p7;
+; CHECK-NEXT: or.pred %p11, %p10, %p9;
+; CHECK-NEXT: xor.pred %p12, %p11, %p3;
+; CHECK-NEXT: selp.b32 %r6, %r4, %r5, %p12;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
; CHECK-NEXT: ret;
%b1 = icmp eq i32 %v1, 0
%b2 = icmp eq i32 %v2, 0
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index f2211eb..44d8558 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -5,9 +5,9 @@
define i128 @srem_i128(i128 %lhs, i128 %rhs) {
; CHECK-LABEL: srem_i128(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<22>;
+; CHECK-NEXT: .reg .pred %p<20>;
; CHECK-NEXT: .reg .b32 %r<12>;
-; CHECK-NEXT: .reg .b64 %rd<126>;
+; CHECK-NEXT: .reg .b64 %rd<127>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [srem_i128_param_0];
@@ -42,103 +42,102 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd62, %r4;
; CHECK-NEXT: add.s64 %rd63, %rd62, 64;
; CHECK-NEXT: selp.b64 %rd64, %rd61, %rd63, %p7;
-; CHECK-NEXT: mov.b64 %rd116, 0;
+; CHECK-NEXT: mov.b64 %rd117, 0;
; CHECK-NEXT: sub.cc.s64 %rd66, %rd60, %rd64;
-; CHECK-NEXT: subc.cc.s64 %rd8, %rd116, 0;
-; CHECK-NEXT: setp.ne.b64 %p8, %rd8, 0;
-; CHECK-NEXT: and.pred %p10, %p8, %p8;
-; CHECK-NEXT: setp.eq.b64 %p11, %rd8, 0;
-; CHECK-NEXT: setp.gt.u64 %p12, %rd66, 127;
-; CHECK-NEXT: and.pred %p13, %p11, %p12;
-; CHECK-NEXT: or.pred %p14, %p13, %p10;
-; CHECK-NEXT: or.pred %p15, %p5, %p14;
-; CHECK-NEXT: xor.b64 %rd67, %rd66, 127;
-; CHECK-NEXT: or.b64 %rd68, %rd67, %rd8;
-; CHECK-NEXT: setp.eq.b64 %p16, %rd68, 0;
-; CHECK-NEXT: selp.b64 %rd125, 0, %rd4, %p15;
-; CHECK-NEXT: selp.b64 %rd124, 0, %rd3, %p15;
-; CHECK-NEXT: or.pred %p17, %p15, %p16;
-; CHECK-NEXT: @%p17 bra $L__BB0_5;
+; CHECK-NEXT: subc.cc.s64 %rd67, %rd117, 0;
+; CHECK-NEXT: setp.gt.u64 %p8, %rd66, 127;
+; CHECK-NEXT: setp.eq.b64 %p9, %rd67, 0;
+; CHECK-NEXT: and.pred %p10, %p9, %p8;
+; CHECK-NEXT: setp.ne.b64 %p11, %rd67, 0;
+; CHECK-NEXT: or.pred %p12, %p10, %p11;
+; CHECK-NEXT: or.pred %p13, %p5, %p12;
+; CHECK-NEXT: xor.b64 %rd68, %rd66, 127;
+; CHECK-NEXT: or.b64 %rd69, %rd68, %rd67;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd69, 0;
+; CHECK-NEXT: selp.b64 %rd126, 0, %rd4, %p13;
+; CHECK-NEXT: selp.b64 %rd125, 0, %rd3, %p13;
+; CHECK-NEXT: or.pred %p15, %p13, %p14;
+; CHECK-NEXT: @%p15 bra $L__BB0_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd118, %rd66, 1;
-; CHECK-NEXT: addc.cc.s64 %rd119, %rd8, 0;
-; CHECK-NEXT: or.b64 %rd71, %rd118, %rd119;
-; CHECK-NEXT: setp.eq.b64 %p18, %rd71, 0;
+; CHECK-NEXT: add.cc.s64 %rd119, %rd66, 1;
+; CHECK-NEXT: addc.cc.s64 %rd120, %rd67, 0;
+; CHECK-NEXT: or.b64 %rd72, %rd119, %rd120;
+; CHECK-NEXT: setp.eq.b64 %p16, %rd72, 0;
; CHECK-NEXT: cvt.u32.u64 %r5, %rd66;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd72, %rd4, %r6;
+; CHECK-NEXT: shl.b64 %rd73, %rd4, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd73, %rd3, %r7;
-; CHECK-NEXT: or.b64 %rd74, %rd72, %rd73;
+; CHECK-NEXT: shr.u64 %rd74, %rd3, %r7;
+; CHECK-NEXT: or.b64 %rd75, %rd73, %rd74;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd75, %rd3, %r8;
-; CHECK-NEXT: setp.gt.s32 %p19, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd123, %rd75, %rd74, %p19;
-; CHECK-NEXT: shl.b64 %rd122, %rd3, %r6;
-; CHECK-NEXT: mov.b64 %rd113, %rd116;
-; CHECK-NEXT: @%p18 bra $L__BB0_4;
+; CHECK-NEXT: shl.b64 %rd76, %rd3, %r8;
+; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63;
+; CHECK-NEXT: selp.b64 %rd124, %rd76, %rd75, %p17;
+; CHECK-NEXT: shl.b64 %rd123, %rd3, %r6;
+; CHECK-NEXT: mov.b64 %rd114, %rd117;
+; CHECK-NEXT: @%p16 bra $L__BB0_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd118;
-; CHECK-NEXT: shr.u64 %rd78, %rd3, %r9;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd119;
+; CHECK-NEXT: shr.u64 %rd79, %rd3, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd79, %rd4, %r10;
-; CHECK-NEXT: or.b64 %rd80, %rd78, %rd79;
+; CHECK-NEXT: shl.b64 %rd80, %rd4, %r10;
+; CHECK-NEXT: or.b64 %rd81, %rd79, %rd80;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd81, %rd4, %r11;
-; CHECK-NEXT: setp.gt.s32 %p20, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd120, %rd81, %rd80, %p20;
-; CHECK-NEXT: shr.u64 %rd121, %rd4, %r9;
+; CHECK-NEXT: shr.u64 %rd82, %rd4, %r11;
+; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63;
+; CHECK-NEXT: selp.b64 %rd121, %rd82, %rd81, %p18;
+; CHECK-NEXT: shr.u64 %rd122, %rd4, %r9;
; CHECK-NEXT: add.cc.s64 %rd35, %rd5, -1;
; CHECK-NEXT: addc.cc.s64 %rd36, %rd6, -1;
-; CHECK-NEXT: mov.b64 %rd113, 0;
-; CHECK-NEXT: mov.b64 %rd116, %rd113;
+; CHECK-NEXT: mov.b64 %rd114, 0;
+; CHECK-NEXT: mov.b64 %rd117, %rd114;
; CHECK-NEXT: $L__BB0_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd82, %rd120, 63;
-; CHECK-NEXT: shl.b64 %rd83, %rd121, 1;
-; CHECK-NEXT: or.b64 %rd84, %rd83, %rd82;
-; CHECK-NEXT: shl.b64 %rd85, %rd120, 1;
-; CHECK-NEXT: shr.u64 %rd86, %rd123, 63;
-; CHECK-NEXT: or.b64 %rd87, %rd85, %rd86;
-; CHECK-NEXT: shr.u64 %rd88, %rd122, 63;
-; CHECK-NEXT: shl.b64 %rd89, %rd123, 1;
-; CHECK-NEXT: or.b64 %rd90, %rd89, %rd88;
-; CHECK-NEXT: shl.b64 %rd91, %rd122, 1;
-; CHECK-NEXT: or.b64 %rd122, %rd116, %rd91;
-; CHECK-NEXT: or.b64 %rd123, %rd113, %rd90;
-; CHECK-NEXT: sub.cc.s64 %rd92, %rd35, %rd87;
-; CHECK-NEXT: subc.cc.s64 %rd93, %rd36, %rd84;
-; CHECK-NEXT: shr.s64 %rd94, %rd93, 63;
-; CHECK-NEXT: and.b64 %rd116, %rd94, 1;
-; CHECK-NEXT: and.b64 %rd95, %rd94, %rd5;
-; CHECK-NEXT: and.b64 %rd96, %rd94, %rd6;
-; CHECK-NEXT: sub.cc.s64 %rd120, %rd87, %rd95;
-; CHECK-NEXT: subc.cc.s64 %rd121, %rd84, %rd96;
-; CHECK-NEXT: add.cc.s64 %rd118, %rd118, -1;
-; CHECK-NEXT: addc.cc.s64 %rd119, %rd119, -1;
-; CHECK-NEXT: or.b64 %rd97, %rd118, %rd119;
-; CHECK-NEXT: setp.eq.b64 %p21, %rd97, 0;
-; CHECK-NEXT: @%p21 bra $L__BB0_4;
+; CHECK-NEXT: shr.u64 %rd83, %rd121, 63;
+; CHECK-NEXT: shl.b64 %rd84, %rd122, 1;
+; CHECK-NEXT: or.b64 %rd85, %rd84, %rd83;
+; CHECK-NEXT: shl.b64 %rd86, %rd121, 1;
+; CHECK-NEXT: shr.u64 %rd87, %rd124, 63;
+; CHECK-NEXT: or.b64 %rd88, %rd86, %rd87;
+; CHECK-NEXT: shr.u64 %rd89, %rd123, 63;
+; CHECK-NEXT: shl.b64 %rd90, %rd124, 1;
+; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT: shl.b64 %rd92, %rd123, 1;
+; CHECK-NEXT: or.b64 %rd123, %rd117, %rd92;
+; CHECK-NEXT: or.b64 %rd124, %rd114, %rd91;
+; CHECK-NEXT: sub.cc.s64 %rd93, %rd35, %rd88;
+; CHECK-NEXT: subc.cc.s64 %rd94, %rd36, %rd85;
+; CHECK-NEXT: shr.s64 %rd95, %rd94, 63;
+; CHECK-NEXT: and.b64 %rd117, %rd95, 1;
+; CHECK-NEXT: and.b64 %rd96, %rd95, %rd5;
+; CHECK-NEXT: and.b64 %rd97, %rd95, %rd6;
+; CHECK-NEXT: sub.cc.s64 %rd121, %rd88, %rd96;
+; CHECK-NEXT: subc.cc.s64 %rd122, %rd85, %rd97;
+; CHECK-NEXT: add.cc.s64 %rd119, %rd119, -1;
+; CHECK-NEXT: addc.cc.s64 %rd120, %rd120, -1;
+; CHECK-NEXT: or.b64 %rd98, %rd119, %rd120;
+; CHECK-NEXT: setp.eq.b64 %p19, %rd98, 0;
+; CHECK-NEXT: @%p19 bra $L__BB0_4;
; CHECK-NEXT: bra.uni $L__BB0_2;
; CHECK-NEXT: $L__BB0_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd98, %rd122, 63;
-; CHECK-NEXT: shl.b64 %rd99, %rd123, 1;
-; CHECK-NEXT: or.b64 %rd100, %rd99, %rd98;
-; CHECK-NEXT: shl.b64 %rd101, %rd122, 1;
-; CHECK-NEXT: or.b64 %rd124, %rd116, %rd101;
-; CHECK-NEXT: or.b64 %rd125, %rd113, %rd100;
+; CHECK-NEXT: shr.u64 %rd99, %rd123, 63;
+; CHECK-NEXT: shl.b64 %rd100, %rd124, 1;
+; CHECK-NEXT: or.b64 %rd101, %rd100, %rd99;
+; CHECK-NEXT: shl.b64 %rd102, %rd123, 1;
+; CHECK-NEXT: or.b64 %rd125, %rd117, %rd102;
+; CHECK-NEXT: or.b64 %rd126, %rd114, %rd101;
; CHECK-NEXT: $L__BB0_5: // %udiv-end
-; CHECK-NEXT: mul.hi.u64 %rd102, %rd5, %rd124;
-; CHECK-NEXT: mad.lo.s64 %rd103, %rd5, %rd125, %rd102;
-; CHECK-NEXT: mad.lo.s64 %rd104, %rd6, %rd124, %rd103;
-; CHECK-NEXT: mul.lo.s64 %rd105, %rd5, %rd124;
-; CHECK-NEXT: sub.cc.s64 %rd106, %rd3, %rd105;
-; CHECK-NEXT: subc.cc.s64 %rd107, %rd4, %rd104;
-; CHECK-NEXT: xor.b64 %rd108, %rd106, %rd2;
+; CHECK-NEXT: mul.hi.u64 %rd103, %rd5, %rd125;
+; CHECK-NEXT: mad.lo.s64 %rd104, %rd5, %rd126, %rd103;
+; CHECK-NEXT: mad.lo.s64 %rd105, %rd6, %rd125, %rd104;
+; CHECK-NEXT: mul.lo.s64 %rd106, %rd5, %rd125;
+; CHECK-NEXT: sub.cc.s64 %rd107, %rd3, %rd106;
+; CHECK-NEXT: subc.cc.s64 %rd108, %rd4, %rd105;
; CHECK-NEXT: xor.b64 %rd109, %rd107, %rd2;
-; CHECK-NEXT: sub.cc.s64 %rd110, %rd108, %rd2;
-; CHECK-NEXT: subc.cc.s64 %rd111, %rd109, %rd2;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd110, %rd111};
+; CHECK-NEXT: xor.b64 %rd110, %rd108, %rd2;
+; CHECK-NEXT: sub.cc.s64 %rd111, %rd109, %rd2;
+; CHECK-NEXT: subc.cc.s64 %rd112, %rd110, %rd2;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd111, %rd112};
; CHECK-NEXT: ret;
%div = srem i128 %lhs, %rhs
ret i128 %div
@@ -149,7 +148,7 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<18>;
; CHECK-NEXT: .reg .b32 %r<12>;
-; CHECK-NEXT: .reg .b64 %rd<111>;
+; CHECK-NEXT: .reg .b64 %rd<113>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [urem_i128_param_0];
@@ -173,98 +172,98 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd52, %r4;
; CHECK-NEXT: add.s64 %rd53, %rd52, 64;
; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT: mov.b64 %rd101, 0;
-; CHECK-NEXT: sub.cc.s64 %rd5, %rd50, %rd54;
-; CHECK-NEXT: subc.cc.s64 %rd6, %rd101, 0;
-; CHECK-NEXT: setp.gt.u64 %p6, %rd5, 127;
-; CHECK-NEXT: setp.eq.b64 %p7, %rd6, 0;
+; CHECK-NEXT: mov.b64 %rd103, 0;
+; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54;
+; CHECK-NEXT: subc.cc.s64 %rd57, %rd103, 0;
+; CHECK-NEXT: setp.gt.u64 %p6, %rd56, 127;
+; CHECK-NEXT: setp.eq.b64 %p7, %rd57, 0;
; CHECK-NEXT: and.pred %p8, %p7, %p6;
-; CHECK-NEXT: setp.ne.b64 %p9, %rd6, 0;
+; CHECK-NEXT: setp.ne.b64 %p9, %rd57, 0;
; CHECK-NEXT: or.pred %p10, %p8, %p9;
; CHECK-NEXT: or.pred %p11, %p3, %p10;
-; CHECK-NEXT: xor.b64 %rd56, %rd5, 127;
-; CHECK-NEXT: or.b64 %rd57, %rd56, %rd6;
-; CHECK-NEXT: setp.eq.b64 %p12, %rd57, 0;
-; CHECK-NEXT: selp.b64 %rd110, 0, %rd42, %p11;
-; CHECK-NEXT: selp.b64 %rd109, 0, %rd41, %p11;
+; CHECK-NEXT: xor.b64 %rd58, %rd56, 127;
+; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57;
+; CHECK-NEXT: setp.eq.b64 %p12, %rd59, 0;
+; CHECK-NEXT: selp.b64 %rd112, 0, %rd42, %p11;
+; CHECK-NEXT: selp.b64 %rd111, 0, %rd41, %p11;
; CHECK-NEXT: or.pred %p13, %p11, %p12;
; CHECK-NEXT: @%p13 bra $L__BB1_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd103, %rd5, 1;
-; CHECK-NEXT: addc.cc.s64 %rd104, %rd6, 0;
-; CHECK-NEXT: or.b64 %rd60, %rd103, %rd104;
-; CHECK-NEXT: setp.eq.b64 %p14, %rd60, 0;
-; CHECK-NEXT: cvt.u32.u64 %r5, %rd5;
+; CHECK-NEXT: add.cc.s64 %rd105, %rd56, 1;
+; CHECK-NEXT: addc.cc.s64 %rd106, %rd57, 0;
+; CHECK-NEXT: or.b64 %rd62, %rd105, %rd106;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd62, 0;
+; CHECK-NEXT: cvt.u32.u64 %r5, %rd56;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd61, %rd42, %r6;
+; CHECK-NEXT: shl.b64 %rd63, %rd42, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd62, %rd41, %r7;
-; CHECK-NEXT: or.b64 %rd63, %rd61, %rd62;
+; CHECK-NEXT: shr.u64 %rd64, %rd41, %r7;
+; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd64, %rd41, %r8;
+; CHECK-NEXT: shl.b64 %rd66, %rd41, %r8;
; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd108, %rd64, %rd63, %p15;
-; CHECK-NEXT: shl.b64 %rd107, %rd41, %r6;
-; CHECK-NEXT: mov.b64 %rd98, %rd101;
+; CHECK-NEXT: selp.b64 %rd110, %rd66, %rd65, %p15;
+; CHECK-NEXT: shl.b64 %rd109, %rd41, %r6;
+; CHECK-NEXT: mov.b64 %rd100, %rd103;
; CHECK-NEXT: @%p14 bra $L__BB1_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd103;
-; CHECK-NEXT: shr.u64 %rd67, %rd41, %r9;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd105;
+; CHECK-NEXT: shr.u64 %rd69, %rd41, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd68, %rd42, %r10;
-; CHECK-NEXT: or.b64 %rd69, %rd67, %rd68;
+; CHECK-NEXT: shl.b64 %rd70, %rd42, %r10;
+; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd70, %rd42, %r11;
+; CHECK-NEXT: shr.u64 %rd72, %rd42, %r11;
; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd105, %rd70, %rd69, %p16;
-; CHECK-NEXT: shr.u64 %rd106, %rd42, %r9;
+; CHECK-NEXT: selp.b64 %rd107, %rd72, %rd71, %p16;
+; CHECK-NEXT: shr.u64 %rd108, %rd42, %r9;
; CHECK-NEXT: add.cc.s64 %rd33, %rd3, -1;
; CHECK-NEXT: addc.cc.s64 %rd34, %rd4, -1;
-; CHECK-NEXT: mov.b64 %rd98, 0;
-; CHECK-NEXT: mov.b64 %rd101, %rd98;
+; CHECK-NEXT: mov.b64 %rd100, 0;
+; CHECK-NEXT: mov.b64 %rd103, %rd100;
; CHECK-NEXT: $L__BB1_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd71, %rd105, 63;
-; CHECK-NEXT: shl.b64 %rd72, %rd106, 1;
-; CHECK-NEXT: or.b64 %rd73, %rd72, %rd71;
-; CHECK-NEXT: shl.b64 %rd74, %rd105, 1;
-; CHECK-NEXT: shr.u64 %rd75, %rd108, 63;
-; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75;
-; CHECK-NEXT: shr.u64 %rd77, %rd107, 63;
-; CHECK-NEXT: shl.b64 %rd78, %rd108, 1;
-; CHECK-NEXT: or.b64 %rd79, %rd78, %rd77;
-; CHECK-NEXT: shl.b64 %rd80, %rd107, 1;
-; CHECK-NEXT: or.b64 %rd107, %rd101, %rd80;
-; CHECK-NEXT: or.b64 %rd108, %rd98, %rd79;
-; CHECK-NEXT: sub.cc.s64 %rd81, %rd33, %rd76;
-; CHECK-NEXT: subc.cc.s64 %rd82, %rd34, %rd73;
-; CHECK-NEXT: shr.s64 %rd83, %rd82, 63;
-; CHECK-NEXT: and.b64 %rd101, %rd83, 1;
-; CHECK-NEXT: and.b64 %rd84, %rd83, %rd3;
-; CHECK-NEXT: and.b64 %rd85, %rd83, %rd4;
-; CHECK-NEXT: sub.cc.s64 %rd105, %rd76, %rd84;
-; CHECK-NEXT: subc.cc.s64 %rd106, %rd73, %rd85;
-; CHECK-NEXT: add.cc.s64 %rd103, %rd103, -1;
-; CHECK-NEXT: addc.cc.s64 %rd104, %rd104, -1;
-; CHECK-NEXT: or.b64 %rd86, %rd103, %rd104;
-; CHECK-NEXT: setp.eq.b64 %p17, %rd86, 0;
+; CHECK-NEXT: shr.u64 %rd73, %rd107, 63;
+; CHECK-NEXT: shl.b64 %rd74, %rd108, 1;
+; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73;
+; CHECK-NEXT: shl.b64 %rd76, %rd107, 1;
+; CHECK-NEXT: shr.u64 %rd77, %rd110, 63;
+; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77;
+; CHECK-NEXT: shr.u64 %rd79, %rd109, 63;
+; CHECK-NEXT: shl.b64 %rd80, %rd110, 1;
+; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79;
+; CHECK-NEXT: shl.b64 %rd82, %rd109, 1;
+; CHECK-NEXT: or.b64 %rd109, %rd103, %rd82;
+; CHECK-NEXT: or.b64 %rd110, %rd100, %rd81;
+; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78;
+; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75;
+; CHECK-NEXT: shr.s64 %rd85, %rd84, 63;
+; CHECK-NEXT: and.b64 %rd103, %rd85, 1;
+; CHECK-NEXT: and.b64 %rd86, %rd85, %rd3;
+; CHECK-NEXT: and.b64 %rd87, %rd85, %rd4;
+; CHECK-NEXT: sub.cc.s64 %rd107, %rd78, %rd86;
+; CHECK-NEXT: subc.cc.s64 %rd108, %rd75, %rd87;
+; CHECK-NEXT: add.cc.s64 %rd105, %rd105, -1;
+; CHECK-NEXT: addc.cc.s64 %rd106, %rd106, -1;
+; CHECK-NEXT: or.b64 %rd88, %rd105, %rd106;
+; CHECK-NEXT: setp.eq.b64 %p17, %rd88, 0;
; CHECK-NEXT: @%p17 bra $L__BB1_4;
; CHECK-NEXT: bra.uni $L__BB1_2;
; CHECK-NEXT: $L__BB1_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd87, %rd107, 63;
-; CHECK-NEXT: shl.b64 %rd88, %rd108, 1;
-; CHECK-NEXT: or.b64 %rd89, %rd88, %rd87;
-; CHECK-NEXT: shl.b64 %rd90, %rd107, 1;
-; CHECK-NEXT: or.b64 %rd109, %rd101, %rd90;
-; CHECK-NEXT: or.b64 %rd110, %rd98, %rd89;
+; CHECK-NEXT: shr.u64 %rd89, %rd109, 63;
+; CHECK-NEXT: shl.b64 %rd90, %rd110, 1;
+; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT: shl.b64 %rd92, %rd109, 1;
+; CHECK-NEXT: or.b64 %rd111, %rd103, %rd92;
+; CHECK-NEXT: or.b64 %rd112, %rd100, %rd91;
; CHECK-NEXT: $L__BB1_5: // %udiv-end
-; CHECK-NEXT: mul.hi.u64 %rd91, %rd3, %rd109;
-; CHECK-NEXT: mad.lo.s64 %rd92, %rd3, %rd110, %rd91;
-; CHECK-NEXT: mad.lo.s64 %rd93, %rd4, %rd109, %rd92;
-; CHECK-NEXT: mul.lo.s64 %rd94, %rd3, %rd109;
-; CHECK-NEXT: sub.cc.s64 %rd95, %rd41, %rd94;
-; CHECK-NEXT: subc.cc.s64 %rd96, %rd42, %rd93;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd95, %rd96};
+; CHECK-NEXT: mul.hi.u64 %rd93, %rd3, %rd111;
+; CHECK-NEXT: mad.lo.s64 %rd94, %rd3, %rd112, %rd93;
+; CHECK-NEXT: mad.lo.s64 %rd95, %rd4, %rd111, %rd94;
+; CHECK-NEXT: mul.lo.s64 %rd96, %rd3, %rd111;
+; CHECK-NEXT: sub.cc.s64 %rd97, %rd41, %rd96;
+; CHECK-NEXT: subc.cc.s64 %rd98, %rd42, %rd95;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd97, %rd98};
; CHECK-NEXT: ret;
%div = urem i128 %lhs, %rhs
ret i128 %div
@@ -307,9 +306,9 @@ define i128 @urem_i128_pow2k(i128 %lhs) {
define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-LABEL: sdiv_i128(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<22>;
+; CHECK-NEXT: .reg .pred %p<20>;
; CHECK-NEXT: .reg .b32 %r<12>;
-; CHECK-NEXT: .reg .b64 %rd<121>;
+; CHECK-NEXT: .reg .b64 %rd<122>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [sdiv_i128_param_0];
@@ -345,97 +344,96 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd63, %r4;
; CHECK-NEXT: add.s64 %rd64, %rd63, 64;
; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7;
-; CHECK-NEXT: mov.b64 %rd111, 0;
+; CHECK-NEXT: mov.b64 %rd112, 0;
; CHECK-NEXT: sub.cc.s64 %rd67, %rd61, %rd65;
-; CHECK-NEXT: subc.cc.s64 %rd8, %rd111, 0;
-; CHECK-NEXT: setp.ne.b64 %p8, %rd8, 0;
-; CHECK-NEXT: and.pred %p10, %p8, %p8;
-; CHECK-NEXT: setp.eq.b64 %p11, %rd8, 0;
-; CHECK-NEXT: setp.gt.u64 %p12, %rd67, 127;
-; CHECK-NEXT: and.pred %p13, %p11, %p12;
-; CHECK-NEXT: or.pred %p14, %p13, %p10;
-; CHECK-NEXT: or.pred %p15, %p5, %p14;
-; CHECK-NEXT: xor.b64 %rd68, %rd67, 127;
-; CHECK-NEXT: or.b64 %rd69, %rd68, %rd8;
-; CHECK-NEXT: setp.eq.b64 %p16, %rd69, 0;
-; CHECK-NEXT: selp.b64 %rd120, 0, %rd2, %p15;
-; CHECK-NEXT: selp.b64 %rd119, 0, %rd1, %p15;
-; CHECK-NEXT: or.pred %p17, %p15, %p16;
-; CHECK-NEXT: @%p17 bra $L__BB4_5;
+; CHECK-NEXT: subc.cc.s64 %rd68, %rd112, 0;
+; CHECK-NEXT: setp.gt.u64 %p8, %rd67, 127;
+; CHECK-NEXT: setp.eq.b64 %p9, %rd68, 0;
+; CHECK-NEXT: and.pred %p10, %p9, %p8;
+; CHECK-NEXT: setp.ne.b64 %p11, %rd68, 0;
+; CHECK-NEXT: or.pred %p12, %p10, %p11;
+; CHECK-NEXT: or.pred %p13, %p5, %p12;
+; CHECK-NEXT: xor.b64 %rd69, %rd67, 127;
+; CHECK-NEXT: or.b64 %rd70, %rd69, %rd68;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd70, 0;
+; CHECK-NEXT: selp.b64 %rd121, 0, %rd2, %p13;
+; CHECK-NEXT: selp.b64 %rd120, 0, %rd1, %p13;
+; CHECK-NEXT: or.pred %p15, %p13, %p14;
+; CHECK-NEXT: @%p15 bra $L__BB4_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd113, %rd67, 1;
-; CHECK-NEXT: addc.cc.s64 %rd114, %rd8, 0;
-; CHECK-NEXT: or.b64 %rd72, %rd113, %rd114;
-; CHECK-NEXT: setp.eq.b64 %p18, %rd72, 0;
+; CHECK-NEXT: add.cc.s64 %rd114, %rd67, 1;
+; CHECK-NEXT: addc.cc.s64 %rd115, %rd68, 0;
+; CHECK-NEXT: or.b64 %rd73, %rd114, %rd115;
+; CHECK-NEXT: setp.eq.b64 %p16, %rd73, 0;
; CHECK-NEXT: cvt.u32.u64 %r5, %rd67;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd73, %rd2, %r6;
+; CHECK-NEXT: shl.b64 %rd74, %rd2, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd74, %rd1, %r7;
-; CHECK-NEXT: or.b64 %rd75, %rd73, %rd74;
+; CHECK-NEXT: shr.u64 %rd75, %rd1, %r7;
+; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd76, %rd1, %r8;
-; CHECK-NEXT: setp.gt.s32 %p19, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd118, %rd76, %rd75, %p19;
-; CHECK-NEXT: shl.b64 %rd117, %rd1, %r6;
-; CHECK-NEXT: mov.b64 %rd108, %rd111;
-; CHECK-NEXT: @%p18 bra $L__BB4_4;
+; CHECK-NEXT: shl.b64 %rd77, %rd1, %r8;
+; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63;
+; CHECK-NEXT: selp.b64 %rd119, %rd77, %rd76, %p17;
+; CHECK-NEXT: shl.b64 %rd118, %rd1, %r6;
+; CHECK-NEXT: mov.b64 %rd109, %rd112;
+; CHECK-NEXT: @%p16 bra $L__BB4_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd113;
-; CHECK-NEXT: shr.u64 %rd79, %rd1, %r9;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd114;
+; CHECK-NEXT: shr.u64 %rd80, %rd1, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd80, %rd2, %r10;
-; CHECK-NEXT: or.b64 %rd81, %rd79, %rd80;
+; CHECK-NEXT: shl.b64 %rd81, %rd2, %r10;
+; CHECK-NEXT: or.b64 %rd82, %rd80, %rd81;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd82, %rd2, %r11;
-; CHECK-NEXT: setp.gt.s32 %p20, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd115, %rd82, %rd81, %p20;
-; CHECK-NEXT: shr.u64 %rd116, %rd2, %r9;
+; CHECK-NEXT: shr.u64 %rd83, %rd2, %r11;
+; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63;
+; CHECK-NEXT: selp.b64 %rd116, %rd83, %rd82, %p18;
+; CHECK-NEXT: shr.u64 %rd117, %rd2, %r9;
; CHECK-NEXT: add.cc.s64 %rd35, %rd3, -1;
; CHECK-NEXT: addc.cc.s64 %rd36, %rd4, -1;
-; CHECK-NEXT: mov.b64 %rd108, 0;
-; CHECK-NEXT: mov.b64 %rd111, %rd108;
+; CHECK-NEXT: mov.b64 %rd109, 0;
+; CHECK-NEXT: mov.b64 %rd112, %rd109;
; CHECK-NEXT: $L__BB4_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd83, %rd115, 63;
-; CHECK-NEXT: shl.b64 %rd84, %rd116, 1;
-; CHECK-NEXT: or.b64 %rd85, %rd84, %rd83;
-; CHECK-NEXT: shl.b64 %rd86, %rd115, 1;
-; CHECK-NEXT: shr.u64 %rd87, %rd118, 63;
-; CHECK-NEXT: or.b64 %rd88, %rd86, %rd87;
-; CHECK-NEXT: shr.u64 %rd89, %rd117, 63;
-; CHECK-NEXT: shl.b64 %rd90, %rd118, 1;
-; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89;
-; CHECK-NEXT: shl.b64 %rd92, %rd117, 1;
-; CHECK-NEXT: or.b64 %rd117, %rd111, %rd92;
-; CHECK-NEXT: or.b64 %rd118, %rd108, %rd91;
-; CHECK-NEXT: sub.cc.s64 %rd93, %rd35, %rd88;
-; CHECK-NEXT: subc.cc.s64 %rd94, %rd36, %rd85;
-; CHECK-NEXT: shr.s64 %rd95, %rd94, 63;
-; CHECK-NEXT: and.b64 %rd111, %rd95, 1;
-; CHECK-NEXT: and.b64 %rd96, %rd95, %rd3;
-; CHECK-NEXT: and.b64 %rd97, %rd95, %rd4;
-; CHECK-NEXT: sub.cc.s64 %rd115, %rd88, %rd96;
-; CHECK-NEXT: subc.cc.s64 %rd116, %rd85, %rd97;
-; CHECK-NEXT: add.cc.s64 %rd113, %rd113, -1;
-; CHECK-NEXT: addc.cc.s64 %rd114, %rd114, -1;
-; CHECK-NEXT: or.b64 %rd98, %rd113, %rd114;
-; CHECK-NEXT: setp.eq.b64 %p21, %rd98, 0;
-; CHECK-NEXT: @%p21 bra $L__BB4_4;
+; CHECK-NEXT: shr.u64 %rd84, %rd116, 63;
+; CHECK-NEXT: shl.b64 %rd85, %rd117, 1;
+; CHECK-NEXT: or.b64 %rd86, %rd85, %rd84;
+; CHECK-NEXT: shl.b64 %rd87, %rd116, 1;
+; CHECK-NEXT: shr.u64 %rd88, %rd119, 63;
+; CHECK-NEXT: or.b64 %rd89, %rd87, %rd88;
+; CHECK-NEXT: shr.u64 %rd90, %rd118, 63;
+; CHECK-NEXT: shl.b64 %rd91, %rd119, 1;
+; CHECK-NEXT: or.b64 %rd92, %rd91, %rd90;
+; CHECK-NEXT: shl.b64 %rd93, %rd118, 1;
+; CHECK-NEXT: or.b64 %rd118, %rd112, %rd93;
+; CHECK-NEXT: or.b64 %rd119, %rd109, %rd92;
+; CHECK-NEXT: sub.cc.s64 %rd94, %rd35, %rd89;
+; CHECK-NEXT: subc.cc.s64 %rd95, %rd36, %rd86;
+; CHECK-NEXT: shr.s64 %rd96, %rd95, 63;
+; CHECK-NEXT: and.b64 %rd112, %rd96, 1;
+; CHECK-NEXT: and.b64 %rd97, %rd96, %rd3;
+; CHECK-NEXT: and.b64 %rd98, %rd96, %rd4;
+; CHECK-NEXT: sub.cc.s64 %rd116, %rd89, %rd97;
+; CHECK-NEXT: subc.cc.s64 %rd117, %rd86, %rd98;
+; CHECK-NEXT: add.cc.s64 %rd114, %rd114, -1;
+; CHECK-NEXT: addc.cc.s64 %rd115, %rd115, -1;
+; CHECK-NEXT: or.b64 %rd99, %rd114, %rd115;
+; CHECK-NEXT: setp.eq.b64 %p19, %rd99, 0;
+; CHECK-NEXT: @%p19 bra $L__BB4_4;
; CHECK-NEXT: bra.uni $L__BB4_2;
; CHECK-NEXT: $L__BB4_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd99, %rd117, 63;
-; CHECK-NEXT: shl.b64 %rd100, %rd118, 1;
-; CHECK-NEXT: or.b64 %rd101, %rd100, %rd99;
-; CHECK-NEXT: shl.b64 %rd102, %rd117, 1;
-; CHECK-NEXT: or.b64 %rd119, %rd111, %rd102;
-; CHECK-NEXT: or.b64 %rd120, %rd108, %rd101;
+; CHECK-NEXT: shr.u64 %rd100, %rd118, 63;
+; CHECK-NEXT: shl.b64 %rd101, %rd119, 1;
+; CHECK-NEXT: or.b64 %rd102, %rd101, %rd100;
+; CHECK-NEXT: shl.b64 %rd103, %rd118, 1;
+; CHECK-NEXT: or.b64 %rd120, %rd112, %rd103;
+; CHECK-NEXT: or.b64 %rd121, %rd109, %rd102;
; CHECK-NEXT: $L__BB4_5: // %udiv-end
-; CHECK-NEXT: xor.b64 %rd103, %rd119, %rd5;
; CHECK-NEXT: xor.b64 %rd104, %rd120, %rd5;
-; CHECK-NEXT: sub.cc.s64 %rd105, %rd103, %rd5;
-; CHECK-NEXT: subc.cc.s64 %rd106, %rd104, %rd5;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd105, %rd106};
+; CHECK-NEXT: xor.b64 %rd105, %rd121, %rd5;
+; CHECK-NEXT: sub.cc.s64 %rd106, %rd104, %rd5;
+; CHECK-NEXT: subc.cc.s64 %rd107, %rd105, %rd5;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd106, %rd107};
; CHECK-NEXT: ret;
%div = sdiv i128 %lhs, %rhs
ret i128 %div
@@ -446,7 +444,7 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<18>;
; CHECK-NEXT: .reg .b32 %r<12>;
-; CHECK-NEXT: .reg .b64 %rd<105>;
+; CHECK-NEXT: .reg .b64 %rd<107>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [udiv_i128_param_0];
@@ -470,92 +468,92 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd52, %r4;
; CHECK-NEXT: add.s64 %rd53, %rd52, 64;
; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT: mov.b64 %rd95, 0;
-; CHECK-NEXT: sub.cc.s64 %rd5, %rd50, %rd54;
-; CHECK-NEXT: subc.cc.s64 %rd6, %rd95, 0;
-; CHECK-NEXT: setp.gt.u64 %p6, %rd5, 127;
-; CHECK-NEXT: setp.eq.b64 %p7, %rd6, 0;
+; CHECK-NEXT: mov.b64 %rd97, 0;
+; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54;
+; CHECK-NEXT: subc.cc.s64 %rd57, %rd97, 0;
+; CHECK-NEXT: setp.gt.u64 %p6, %rd56, 127;
+; CHECK-NEXT: setp.eq.b64 %p7, %rd57, 0;
; CHECK-NEXT: and.pred %p8, %p7, %p6;
-; CHECK-NEXT: setp.ne.b64 %p9, %rd6, 0;
+; CHECK-NEXT: setp.ne.b64 %p9, %rd57, 0;
; CHECK-NEXT: or.pred %p10, %p8, %p9;
; CHECK-NEXT: or.pred %p11, %p3, %p10;
-; CHECK-NEXT: xor.b64 %rd56, %rd5, 127;
-; CHECK-NEXT: or.b64 %rd57, %rd56, %rd6;
-; CHECK-NEXT: setp.eq.b64 %p12, %rd57, 0;
-; CHECK-NEXT: selp.b64 %rd104, 0, %rd42, %p11;
-; CHECK-NEXT: selp.b64 %rd103, 0, %rd41, %p11;
+; CHECK-NEXT: xor.b64 %rd58, %rd56, 127;
+; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57;
+; CHECK-NEXT: setp.eq.b64 %p12, %rd59, 0;
+; CHECK-NEXT: selp.b64 %rd106, 0, %rd42, %p11;
+; CHECK-NEXT: selp.b64 %rd105, 0, %rd41, %p11;
; CHECK-NEXT: or.pred %p13, %p11, %p12;
; CHECK-NEXT: @%p13 bra $L__BB5_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd97, %rd5, 1;
-; CHECK-NEXT: addc.cc.s64 %rd98, %rd6, 0;
-; CHECK-NEXT: or.b64 %rd60, %rd97, %rd98;
-; CHECK-NEXT: setp.eq.b64 %p14, %rd60, 0;
-; CHECK-NEXT: cvt.u32.u64 %r5, %rd5;
+; CHECK-NEXT: add.cc.s64 %rd99, %rd56, 1;
+; CHECK-NEXT: addc.cc.s64 %rd100, %rd57, 0;
+; CHECK-NEXT: or.b64 %rd62, %rd99, %rd100;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd62, 0;
+; CHECK-NEXT: cvt.u32.u64 %r5, %rd56;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd61, %rd42, %r6;
+; CHECK-NEXT: shl.b64 %rd63, %rd42, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd62, %rd41, %r7;
-; CHECK-NEXT: or.b64 %rd63, %rd61, %rd62;
+; CHECK-NEXT: shr.u64 %rd64, %rd41, %r7;
+; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd64, %rd41, %r8;
+; CHECK-NEXT: shl.b64 %rd66, %rd41, %r8;
; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd102, %rd64, %rd63, %p15;
-; CHECK-NEXT: shl.b64 %rd101, %rd41, %r6;
-; CHECK-NEXT: mov.b64 %rd92, %rd95;
+; CHECK-NEXT: selp.b64 %rd104, %rd66, %rd65, %p15;
+; CHECK-NEXT: shl.b64 %rd103, %rd41, %r6;
+; CHECK-NEXT: mov.b64 %rd94, %rd97;
; CHECK-NEXT: @%p14 bra $L__BB5_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd97;
-; CHECK-NEXT: shr.u64 %rd67, %rd41, %r9;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd99;
+; CHECK-NEXT: shr.u64 %rd69, %rd41, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd68, %rd42, %r10;
-; CHECK-NEXT: or.b64 %rd69, %rd67, %rd68;
+; CHECK-NEXT: shl.b64 %rd70, %rd42, %r10;
+; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd70, %rd42, %r11;
+; CHECK-NEXT: shr.u64 %rd72, %rd42, %r11;
; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd99, %rd70, %rd69, %p16;
-; CHECK-NEXT: shr.u64 %rd100, %rd42, %r9;
+; CHECK-NEXT: selp.b64 %rd101, %rd72, %rd71, %p16;
+; CHECK-NEXT: shr.u64 %rd102, %rd42, %r9;
; CHECK-NEXT: add.cc.s64 %rd33, %rd43, -1;
; CHECK-NEXT: addc.cc.s64 %rd34, %rd44, -1;
-; CHECK-NEXT: mov.b64 %rd92, 0;
-; CHECK-NEXT: mov.b64 %rd95, %rd92;
+; CHECK-NEXT: mov.b64 %rd94, 0;
+; CHECK-NEXT: mov.b64 %rd97, %rd94;
; CHECK-NEXT: $L__BB5_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd71, %rd99, 63;
-; CHECK-NEXT: shl.b64 %rd72, %rd100, 1;
-; CHECK-NEXT: or.b64 %rd73, %rd72, %rd71;
-; CHECK-NEXT: shl.b64 %rd74, %rd99, 1;
-; CHECK-NEXT: shr.u64 %rd75, %rd102, 63;
-; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75;
-; CHECK-NEXT: shr.u64 %rd77, %rd101, 63;
-; CHECK-NEXT: shl.b64 %rd78, %rd102, 1;
-; CHECK-NEXT: or.b64 %rd79, %rd78, %rd77;
-; CHECK-NEXT: shl.b64 %rd80, %rd101, 1;
-; CHECK-NEXT: or.b64 %rd101, %rd95, %rd80;
-; CHECK-NEXT: or.b64 %rd102, %rd92, %rd79;
-; CHECK-NEXT: sub.cc.s64 %rd81, %rd33, %rd76;
-; CHECK-NEXT: subc.cc.s64 %rd82, %rd34, %rd73;
-; CHECK-NEXT: shr.s64 %rd83, %rd82, 63;
-; CHECK-NEXT: and.b64 %rd95, %rd83, 1;
-; CHECK-NEXT: and.b64 %rd84, %rd83, %rd43;
-; CHECK-NEXT: and.b64 %rd85, %rd83, %rd44;
-; CHECK-NEXT: sub.cc.s64 %rd99, %rd76, %rd84;
-; CHECK-NEXT: subc.cc.s64 %rd100, %rd73, %rd85;
-; CHECK-NEXT: add.cc.s64 %rd97, %rd97, -1;
-; CHECK-NEXT: addc.cc.s64 %rd98, %rd98, -1;
-; CHECK-NEXT: or.b64 %rd86, %rd97, %rd98;
-; CHECK-NEXT: setp.eq.b64 %p17, %rd86, 0;
+; CHECK-NEXT: shr.u64 %rd73, %rd101, 63;
+; CHECK-NEXT: shl.b64 %rd74, %rd102, 1;
+; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73;
+; CHECK-NEXT: shl.b64 %rd76, %rd101, 1;
+; CHECK-NEXT: shr.u64 %rd77, %rd104, 63;
+; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77;
+; CHECK-NEXT: shr.u64 %rd79, %rd103, 63;
+; CHECK-NEXT: shl.b64 %rd80, %rd104, 1;
+; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79;
+; CHECK-NEXT: shl.b64 %rd82, %rd103, 1;
+; CHECK-NEXT: or.b64 %rd103, %rd97, %rd82;
+; CHECK-NEXT: or.b64 %rd104, %rd94, %rd81;
+; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78;
+; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75;
+; CHECK-NEXT: shr.s64 %rd85, %rd84, 63;
+; CHECK-NEXT: and.b64 %rd97, %rd85, 1;
+; CHECK-NEXT: and.b64 %rd86, %rd85, %rd43;
+; CHECK-NEXT: and.b64 %rd87, %rd85, %rd44;
+; CHECK-NEXT: sub.cc.s64 %rd101, %rd78, %rd86;
+; CHECK-NEXT: subc.cc.s64 %rd102, %rd75, %rd87;
+; CHECK-NEXT: add.cc.s64 %rd99, %rd99, -1;
+; CHECK-NEXT: addc.cc.s64 %rd100, %rd100, -1;
+; CHECK-NEXT: or.b64 %rd88, %rd99, %rd100;
+; CHECK-NEXT: setp.eq.b64 %p17, %rd88, 0;
; CHECK-NEXT: @%p17 bra $L__BB5_4;
; CHECK-NEXT: bra.uni $L__BB5_2;
; CHECK-NEXT: $L__BB5_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd87, %rd101, 63;
-; CHECK-NEXT: shl.b64 %rd88, %rd102, 1;
-; CHECK-NEXT: or.b64 %rd89, %rd88, %rd87;
-; CHECK-NEXT: shl.b64 %rd90, %rd101, 1;
-; CHECK-NEXT: or.b64 %rd103, %rd95, %rd90;
-; CHECK-NEXT: or.b64 %rd104, %rd92, %rd89;
+; CHECK-NEXT: shr.u64 %rd89, %rd103, 63;
+; CHECK-NEXT: shl.b64 %rd90, %rd104, 1;
+; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT: shl.b64 %rd92, %rd103, 1;
+; CHECK-NEXT: or.b64 %rd105, %rd97, %rd92;
+; CHECK-NEXT: or.b64 %rd106, %rd94, %rd91;
; CHECK-NEXT: $L__BB5_5: // %udiv-end
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd103, %rd104};
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd105, %rd106};
; CHECK-NEXT: ret;
%div = udiv i128 %lhs, %rhs
ret i128 %div
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
index 821cfd0..b540948 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
@@ -764,8 +764,13 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
;
; CHECK-PWR7-LABEL: sub_absv_8_ext:
; CHECK-PWR7: # %bb.0: # %entry
-; CHECK-PWR7-NEXT: stdu r1, -448(r1)
-; CHECK-PWR7-NEXT: .cfi_def_cfa_offset 448
+; CHECK-PWR7-NEXT: stdu r1, -512(r1)
+; CHECK-PWR7-NEXT: .cfi_def_cfa_offset 512
+; CHECK-PWR7-NEXT: .cfi_offset r14, -144
+; CHECK-PWR7-NEXT: .cfi_offset r15, -136
+; CHECK-PWR7-NEXT: .cfi_offset r16, -128
+; CHECK-PWR7-NEXT: .cfi_offset r17, -120
+; CHECK-PWR7-NEXT: .cfi_offset r18, -112
; CHECK-PWR7-NEXT: .cfi_offset r19, -104
; CHECK-PWR7-NEXT: .cfi_offset r20, -96
; CHECK-PWR7-NEXT: .cfi_offset r21, -88
@@ -778,258 +783,244 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
; CHECK-PWR7-NEXT: .cfi_offset r28, -32
; CHECK-PWR7-NEXT: .cfi_offset r29, -24
; CHECK-PWR7-NEXT: .cfi_offset r30, -16
-; CHECK-PWR7-NEXT: addi r3, r1, 304
-; CHECK-PWR7-NEXT: std r19, 344(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r20, 352(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r21, 360(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r22, 368(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r23, 376(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r24, 384(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r25, 392(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r26, 400(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r27, 408(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r28, 416(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r29, 424(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r30, 432(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: stxvw4x v2, 0, r3
+; CHECK-PWR7-NEXT: .cfi_offset r31, -8
+; CHECK-PWR7-NEXT: .cfi_offset r2, -152
; CHECK-PWR7-NEXT: addi r3, r1, 320
-; CHECK-PWR7-NEXT: lbz r7, 304(r1)
-; CHECK-PWR7-NEXT: stxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: lbz r8, 320(r1)
-; CHECK-PWR7-NEXT: lbz r9, 305(r1)
-; CHECK-PWR7-NEXT: lbz r10, 321(r1)
-; CHECK-PWR7-NEXT: lbz r26, 325(r1)
-; CHECK-PWR7-NEXT: clrlwi r7, r7, 24
-; CHECK-PWR7-NEXT: clrlwi r8, r8, 24
-; CHECK-PWR7-NEXT: clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT: clrlwi r10, r10, 24
-; CHECK-PWR7-NEXT: lbz r11, 306(r1)
-; CHECK-PWR7-NEXT: lbz r12, 322(r1)
-; CHECK-PWR7-NEXT: lbz r23, 314(r1)
-; CHECK-PWR7-NEXT: clrlwi r22, r26, 24
-; CHECK-PWR7-NEXT: lbz r26, 330(r1)
-; CHECK-PWR7-NEXT: sub r8, r7, r8
-; CHECK-PWR7-NEXT: lbz r7, 315(r1)
-; CHECK-PWR7-NEXT: sub r20, r9, r10
-; CHECK-PWR7-NEXT: lbz r9, 331(r1)
-; CHECK-PWR7-NEXT: lbz r0, 307(r1)
-; CHECK-PWR7-NEXT: lbz r30, 323(r1)
-; CHECK-PWR7-NEXT: clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT: clrlwi r12, r12, 24
-; CHECK-PWR7-NEXT: clrlwi r23, r23, 24
-; CHECK-PWR7-NEXT: clrlwi r21, r26, 24
-; CHECK-PWR7-NEXT: clrlwi r7, r7, 24
-; CHECK-PWR7-NEXT: clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT: clrlwi r0, r0, 24
-; CHECK-PWR7-NEXT: clrlwi r30, r30, 24
-; CHECK-PWR7-NEXT: lbz r29, 308(r1)
-; CHECK-PWR7-NEXT: lbz r28, 324(r1)
-; CHECK-PWR7-NEXT: lbz r27, 309(r1)
-; CHECK-PWR7-NEXT: lbz r25, 310(r1)
-; CHECK-PWR7-NEXT: lbz r24, 326(r1)
-; CHECK-PWR7-NEXT: sub r19, r11, r12
-; CHECK-PWR7-NEXT: sub r11, r23, r21
-; CHECK-PWR7-NEXT: sub r9, r7, r9
-; CHECK-PWR7-NEXT: sub r26, r0, r30
-; CHECK-PWR7-NEXT: srawi r12, r11, 31
-; CHECK-PWR7-NEXT: srawi r0, r9, 31
-; CHECK-PWR7-NEXT: lbz r3, 312(r1)
-; CHECK-PWR7-NEXT: clrlwi r29, r29, 24
-; CHECK-PWR7-NEXT: clrlwi r28, r28, 24
-; CHECK-PWR7-NEXT: clrlwi r27, r27, 24
-; CHECK-PWR7-NEXT: clrlwi r25, r25, 24
-; CHECK-PWR7-NEXT: clrlwi r24, r24, 24
-; CHECK-PWR7-NEXT: xor r11, r11, r12
-; CHECK-PWR7-NEXT: xor r9, r9, r0
-; CHECK-PWR7-NEXT: sub r28, r29, r28
-; CHECK-PWR7-NEXT: sub r30, r27, r22
-; CHECK-PWR7-NEXT: sub r29, r25, r24
-; CHECK-PWR7-NEXT: sub r27, r11, r12
-; CHECK-PWR7-NEXT: sub r24, r9, r0
-; CHECK-PWR7-NEXT: lbz r9, 316(r1)
-; CHECK-PWR7-NEXT: lbz r11, 332(r1)
-; CHECK-PWR7-NEXT: lbz r4, 328(r1)
-; CHECK-PWR7-NEXT: lbz r5, 311(r1)
-; CHECK-PWR7-NEXT: lbz r6, 327(r1)
-; CHECK-PWR7-NEXT: clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT: clrlwi r3, r3, 24
-; CHECK-PWR7-NEXT: clrlwi r4, r4, 24
-; CHECK-PWR7-NEXT: clrlwi r5, r5, 24
-; CHECK-PWR7-NEXT: clrlwi r6, r6, 24
-; CHECK-PWR7-NEXT: sub r3, r3, r4
+; CHECK-PWR7-NEXT: std r14, 368(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r15, 376(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r16, 384(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r17, 392(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r18, 400(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r19, 408(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r20, 416(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r21, 424(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r22, 432(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r23, 440(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r24, 448(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r25, 456(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r26, 464(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r27, 472(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r28, 480(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r29, 488(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r30, 496(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r31, 504(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r2, 360(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: stxvw4x v2, 0, r3
+; CHECK-PWR7-NEXT: lbz r3, 320(r1)
+; CHECK-PWR7-NEXT: addi r4, r1, 336
+; CHECK-PWR7-NEXT: stw r3, 60(r1) # 4-byte Folded Spill
+; CHECK-PWR7-NEXT: stxvw4x v3, 0, r4
+; CHECK-PWR7-NEXT: lbz r15, 334(r1)
+; CHECK-PWR7-NEXT: lbz r14, 350(r1)
+; CHECK-PWR7-NEXT: lbz r31, 335(r1)
+; CHECK-PWR7-NEXT: lbz r2, 351(r1)
+; CHECK-PWR7-NEXT: sub r15, r15, r14
+; CHECK-PWR7-NEXT: sub r14, r31, r2
+; CHECK-PWR7-NEXT: srawi r2, r14, 31
+; CHECK-PWR7-NEXT: xor r14, r14, r2
+; CHECK-PWR7-NEXT: lbz r3, 333(r1)
+; CHECK-PWR7-NEXT: lbz r19, 331(r1)
+; CHECK-PWR7-NEXT: lbz r18, 347(r1)
+; CHECK-PWR7-NEXT: sub r19, r19, r18
+; CHECK-PWR7-NEXT: lbz r17, 332(r1)
+; CHECK-PWR7-NEXT: lbz r16, 348(r1)
+; CHECK-PWR7-NEXT: sub r17, r17, r16
+; CHECK-PWR7-NEXT: lbz r23, 329(r1)
+; CHECK-PWR7-NEXT: sub r14, r14, r2
+; CHECK-PWR7-NEXT: lbz r2, 349(r1)
+; CHECK-PWR7-NEXT: lbz r22, 345(r1)
+; CHECK-PWR7-NEXT: lbz r4, 336(r1)
+; CHECK-PWR7-NEXT: lbz r5, 321(r1)
+; CHECK-PWR7-NEXT: lbz r6, 337(r1)
+; CHECK-PWR7-NEXT: lbz r7, 322(r1)
+; CHECK-PWR7-NEXT: lbz r8, 338(r1)
+; CHECK-PWR7-NEXT: lbz r9, 323(r1)
+; CHECK-PWR7-NEXT: lbz r10, 339(r1)
+; CHECK-PWR7-NEXT: lbz r11, 324(r1)
+; CHECK-PWR7-NEXT: lbz r12, 340(r1)
+; CHECK-PWR7-NEXT: lbz r0, 325(r1)
+; CHECK-PWR7-NEXT: lbz r30, 341(r1)
+; CHECK-PWR7-NEXT: lbz r29, 326(r1)
+; CHECK-PWR7-NEXT: lbz r28, 342(r1)
+; CHECK-PWR7-NEXT: lbz r27, 327(r1)
+; CHECK-PWR7-NEXT: lbz r26, 343(r1)
+; CHECK-PWR7-NEXT: sub r3, r3, r2
+; CHECK-PWR7-NEXT: lbz r25, 328(r1)
+; CHECK-PWR7-NEXT: lbz r24, 344(r1)
+; CHECK-PWR7-NEXT: lbz r21, 330(r1)
+; CHECK-PWR7-NEXT: lbz r20, 346(r1)
; CHECK-PWR7-NEXT: sub r5, r5, r6
-; CHECK-PWR7-NEXT: clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT: srawi r4, r3, 31
+; CHECK-PWR7-NEXT: srawi r18, r3, 31
+; CHECK-PWR7-NEXT: sub r7, r7, r8
+; CHECK-PWR7-NEXT: sub r9, r9, r10
+; CHECK-PWR7-NEXT: sub r11, r11, r12
+; CHECK-PWR7-NEXT: sub r0, r0, r30
+; CHECK-PWR7-NEXT: sub r29, r29, r28
+; CHECK-PWR7-NEXT: sub r27, r27, r26
+; CHECK-PWR7-NEXT: sub r25, r25, r24
+; CHECK-PWR7-NEXT: srawi r31, r15, 31
+; CHECK-PWR7-NEXT: ld r2, 360(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: xor r3, r3, r18
; CHECK-PWR7-NEXT: srawi r6, r5, 31
-; CHECK-PWR7-NEXT: xor r3, r3, r4
-; CHECK-PWR7-NEXT: sldi r27, r27, 56
-; CHECK-PWR7-NEXT: xor r5, r5, r6
-; CHECK-PWR7-NEXT: sub r9, r9, r11
-; CHECK-PWR7-NEXT: sub r3, r3, r4
-; CHECK-PWR7-NEXT: sldi r24, r24, 56
+; CHECK-PWR7-NEXT: srawi r8, r7, 31
+; CHECK-PWR7-NEXT: srawi r10, r9, 31
+; CHECK-PWR7-NEXT: srawi r12, r11, 31
+; CHECK-PWR7-NEXT: srawi r30, r0, 31
+; CHECK-PWR7-NEXT: sub r3, r3, r18
+; CHECK-PWR7-NEXT: srawi r18, r19, 31
+; CHECK-PWR7-NEXT: srawi r28, r29, 31
+; CHECK-PWR7-NEXT: ld r16, 384(r1) # 8-byte Folded Reload
; CHECK-PWR7-NEXT: sldi r3, r3, 56
-; CHECK-PWR7-NEXT: srawi r11, r9, 31
-; CHECK-PWR7-NEXT: std r27, 208(r1)
-; CHECK-PWR7-NEXT: sub r4, r5, r6
-; CHECK-PWR7-NEXT: std r27, 216(r1)
-; CHECK-PWR7-NEXT: srawi r27, r29, 31
-; CHECK-PWR7-NEXT: lbz r10, 313(r1)
-; CHECK-PWR7-NEXT: xor r9, r9, r11
-; CHECK-PWR7-NEXT: std r24, 224(r1)
-; CHECK-PWR7-NEXT: lbz r22, 329(r1)
-; CHECK-PWR7-NEXT: std r24, 232(r1)
-; CHECK-PWR7-NEXT: srawi r24, r30, 31
-; CHECK-PWR7-NEXT: ld r21, 360(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: sub r23, r9, r11
-; CHECK-PWR7-NEXT: lbz r9, 317(r1)
-; CHECK-PWR7-NEXT: lbz r11, 333(r1)
-; CHECK-PWR7-NEXT: xor r29, r29, r27
-; CHECK-PWR7-NEXT: std r3, 176(r1)
-; CHECK-PWR7-NEXT: std r3, 184(r1)
-; CHECK-PWR7-NEXT: sldi r3, r4, 56
-; CHECK-PWR7-NEXT: sldi r23, r23, 56
-; CHECK-PWR7-NEXT: xor r30, r30, r24
-; CHECK-PWR7-NEXT: clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT: clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT: sub r4, r30, r24
-; CHECK-PWR7-NEXT: ld r30, 432(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: std r3, 160(r1)
-; CHECK-PWR7-NEXT: std r3, 168(r1)
-; CHECK-PWR7-NEXT: sub r9, r9, r11
-; CHECK-PWR7-NEXT: sub r3, r29, r27
-; CHECK-PWR7-NEXT: std r23, 240(r1)
-; CHECK-PWR7-NEXT: ld r29, 424(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r11, r9, 31
-; CHECK-PWR7-NEXT: std r23, 248(r1)
-; CHECK-PWR7-NEXT: ld r27, 408(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r23, r28, 31
+; CHECK-PWR7-NEXT: srawi r26, r27, 31
+; CHECK-PWR7-NEXT: srawi r24, r25, 31
+; CHECK-PWR7-NEXT: xor r19, r19, r18
+; CHECK-PWR7-NEXT: xor r15, r15, r31
+; CHECK-PWR7-NEXT: xor r5, r5, r6
+; CHECK-PWR7-NEXT: std r3, 272(r1)
+; CHECK-PWR7-NEXT: std r3, 280(r1)
+; CHECK-PWR7-NEXT: srawi r3, r17, 31
+; CHECK-PWR7-NEXT: sub r19, r19, r18
+; CHECK-PWR7-NEXT: xor r7, r7, r8
+; CHECK-PWR7-NEXT: sub r15, r15, r31
+; CHECK-PWR7-NEXT: xor r17, r17, r3
+; CHECK-PWR7-NEXT: xor r9, r9, r10
+; CHECK-PWR7-NEXT: xor r11, r11, r12
+; CHECK-PWR7-NEXT: xor r0, r0, r30
+; CHECK-PWR7-NEXT: xor r29, r29, r28
+; CHECK-PWR7-NEXT: xor r27, r27, r26
+; CHECK-PWR7-NEXT: sub r3, r17, r3
+; CHECK-PWR7-NEXT: xor r25, r25, r24
+; CHECK-PWR7-NEXT: sub r25, r25, r24
+; CHECK-PWR7-NEXT: sub r27, r27, r26
+; CHECK-PWR7-NEXT: sub r29, r29, r28
; CHECK-PWR7-NEXT: sldi r3, r3, 56
-; CHECK-PWR7-NEXT: xor r28, r28, r23
-; CHECK-PWR7-NEXT: xor r9, r9, r11
-; CHECK-PWR7-NEXT: std r3, 144(r1)
-; CHECK-PWR7-NEXT: ld r24, 384(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: std r3, 152(r1)
-; CHECK-PWR7-NEXT: sldi r3, r4, 56
-; CHECK-PWR7-NEXT: sub r25, r9, r11
-; CHECK-PWR7-NEXT: lbz r9, 318(r1)
-; CHECK-PWR7-NEXT: lbz r11, 334(r1)
-; CHECK-PWR7-NEXT: std r3, 128(r1)
+; CHECK-PWR7-NEXT: sub r0, r0, r30
+; CHECK-PWR7-NEXT: sub r11, r11, r12
+; CHECK-PWR7-NEXT: sub r9, r9, r10
+; CHECK-PWR7-NEXT: sub r7, r7, r8
+; CHECK-PWR7-NEXT: sub r5, r5, r6
+; CHECK-PWR7-NEXT: sldi r14, r14, 56
+; CHECK-PWR7-NEXT: sldi r15, r15, 56
+; CHECK-PWR7-NEXT: ld r31, 504(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r3, 256(r1)
+; CHECK-PWR7-NEXT: std r3, 264(r1)
+; CHECK-PWR7-NEXT: sldi r3, r19, 56
; CHECK-PWR7-NEXT: sldi r25, r25, 56
-; CHECK-PWR7-NEXT: std r3, 136(r1)
-; CHECK-PWR7-NEXT: sub r3, r28, r23
+; CHECK-PWR7-NEXT: sldi r27, r27, 56
+; CHECK-PWR7-NEXT: std r3, 240(r1)
+; CHECK-PWR7-NEXT: std r3, 248(r1)
+; CHECK-PWR7-NEXT: sub r3, r23, r22
+; CHECK-PWR7-NEXT: srawi r23, r3, 31
+; CHECK-PWR7-NEXT: sub r22, r21, r20
+; CHECK-PWR7-NEXT: srawi r21, r22, 31
+; CHECK-PWR7-NEXT: sldi r29, r29, 56
+; CHECK-PWR7-NEXT: sldi r0, r0, 56
+; CHECK-PWR7-NEXT: sldi r11, r11, 56
+; CHECK-PWR7-NEXT: xor r3, r3, r23
+; CHECK-PWR7-NEXT: xor r22, r22, r21
+; CHECK-PWR7-NEXT: sldi r9, r9, 56
+; CHECK-PWR7-NEXT: sldi r7, r7, 56
+; CHECK-PWR7-NEXT: sldi r5, r5, 56
+; CHECK-PWR7-NEXT: ld r30, 496(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r28, 480(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: sub r3, r3, r23
+; CHECK-PWR7-NEXT: sub r22, r22, r21
+; CHECK-PWR7-NEXT: std r14, 304(r1)
+; CHECK-PWR7-NEXT: ld r26, 464(r1) # 8-byte Folded Reload
; CHECK-PWR7-NEXT: sldi r3, r3, 56
-; CHECK-PWR7-NEXT: std r3, 112(r1)
-; CHECK-PWR7-NEXT: ld r28, 416(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT: clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT: clrlwi r10, r10, 24
-; CHECK-PWR7-NEXT: std r25, 256(r1)
-; CHECK-PWR7-NEXT: std r25, 264(r1)
-; CHECK-PWR7-NEXT: sub r9, r9, r11
-; CHECK-PWR7-NEXT: srawi r25, r26, 31
-; CHECK-PWR7-NEXT: xor r26, r26, r25
-; CHECK-PWR7-NEXT: ld r23, 376(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r11, r9, 31
-; CHECK-PWR7-NEXT: std r3, 120(r1)
-; CHECK-PWR7-NEXT: sub r4, r26, r25
-; CHECK-PWR7-NEXT: clrlwi r22, r22, 24
-; CHECK-PWR7-NEXT: srawi r7, r8, 31
-; CHECK-PWR7-NEXT: sub r10, r10, r22
-; CHECK-PWR7-NEXT: ld r26, 400(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: xor r9, r9, r11
-; CHECK-PWR7-NEXT: sldi r3, r4, 56
-; CHECK-PWR7-NEXT: srawi r22, r10, 31
-; CHECK-PWR7-NEXT: xor r8, r8, r7
-; CHECK-PWR7-NEXT: xor r10, r10, r22
-; CHECK-PWR7-NEXT: sub r10, r10, r22
-; CHECK-PWR7-NEXT: ld r25, 392(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: sub r12, r9, r11
-; CHECK-PWR7-NEXT: lbz r9, 319(r1)
-; CHECK-PWR7-NEXT: lbz r11, 335(r1)
-; CHECK-PWR7-NEXT: std r3, 96(r1)
-; CHECK-PWR7-NEXT: sldi r12, r12, 56
-; CHECK-PWR7-NEXT: std r3, 104(r1)
-; CHECK-PWR7-NEXT: ld r22, 368(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: sldi r10, r10, 56
-; CHECK-PWR7-NEXT: std r10, 192(r1)
-; CHECK-PWR7-NEXT: clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT: clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT: sub r9, r9, r11
-; CHECK-PWR7-NEXT: std r12, 272(r1)
-; CHECK-PWR7-NEXT: std r12, 280(r1)
-; CHECK-PWR7-NEXT: srawi r12, r19, 31
-; CHECK-PWR7-NEXT: xor r0, r19, r12
-; CHECK-PWR7-NEXT: ld r19, 344(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: sub r3, r0, r12
-; CHECK-PWR7-NEXT: srawi r11, r9, 31
-; CHECK-PWR7-NEXT: std r10, 200(r1)
-; CHECK-PWR7-NEXT: xor r9, r9, r11
+; CHECK-PWR7-NEXT: sldi r22, r22, 56
+; CHECK-PWR7-NEXT: ld r24, 448(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r23, 440(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r14, 312(r1)
+; CHECK-PWR7-NEXT: std r15, 288(r1)
+; CHECK-PWR7-NEXT: std r3, 208(r1)
+; CHECK-PWR7-NEXT: std r3, 216(r1)
+; CHECK-PWR7-NEXT: lwz r3, 60(r1) # 4-byte Folded Reload
+; CHECK-PWR7-NEXT: std r15, 296(r1)
+; CHECK-PWR7-NEXT: ld r21, 424(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r20, 416(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r22, 224(r1)
+; CHECK-PWR7-NEXT: std r22, 232(r1)
+; CHECK-PWR7-NEXT: sub r4, r3, r4
+; CHECK-PWR7-NEXT: std r25, 192(r1)
+; CHECK-PWR7-NEXT: ld r22, 432(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r19, 408(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: srawi r3, r4, 31
+; CHECK-PWR7-NEXT: std r25, 200(r1)
+; CHECK-PWR7-NEXT: ld r25, 456(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r27, 176(r1)
+; CHECK-PWR7-NEXT: std r27, 184(r1)
+; CHECK-PWR7-NEXT: xor r4, r4, r3
+; CHECK-PWR7-NEXT: std r29, 160(r1)
+; CHECK-PWR7-NEXT: ld r27, 472(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r29, 168(r1)
+; CHECK-PWR7-NEXT: std r0, 144(r1)
+; CHECK-PWR7-NEXT: sub r3, r4, r3
+; CHECK-PWR7-NEXT: std r0, 152(r1)
+; CHECK-PWR7-NEXT: ld r29, 488(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r18, 400(r1) # 8-byte Folded Reload
; CHECK-PWR7-NEXT: sldi r3, r3, 56
-; CHECK-PWR7-NEXT: sub r9, r9, r11
-; CHECK-PWR7-NEXT: std r3, 80(r1)
-; CHECK-PWR7-NEXT: std r3, 88(r1)
-; CHECK-PWR7-NEXT: sldi r9, r9, 56
-; CHECK-PWR7-NEXT: std r9, 288(r1)
-; CHECK-PWR7-NEXT: std r9, 296(r1)
-; CHECK-PWR7-NEXT: srawi r9, r20, 31
-; CHECK-PWR7-NEXT: xor r11, r20, r9
-; CHECK-PWR7-NEXT: ld r20, 352(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: sub r4, r11, r9
-; CHECK-PWR7-NEXT: sldi r3, r4, 56
+; CHECK-PWR7-NEXT: std r11, 128(r1)
+; CHECK-PWR7-NEXT: ld r17, 392(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r11, 136(r1)
+; CHECK-PWR7-NEXT: std r9, 112(r1)
; CHECK-PWR7-NEXT: std r3, 64(r1)
; CHECK-PWR7-NEXT: std r3, 72(r1)
-; CHECK-PWR7-NEXT: sub r3, r8, r7
-; CHECK-PWR7-NEXT: sldi r3, r3, 56
-; CHECK-PWR7-NEXT: std r3, 48(r1)
-; CHECK-PWR7-NEXT: std r3, 56(r1)
-; CHECK-PWR7-NEXT: addi r3, r1, 288
+; CHECK-PWR7-NEXT: addi r3, r1, 304
+; CHECK-PWR7-NEXT: std r9, 120(r1)
+; CHECK-PWR7-NEXT: ld r15, 376(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r7, 96(r1)
+; CHECK-PWR7-NEXT: std r7, 104(r1)
+; CHECK-PWR7-NEXT: std r5, 80(r1)
+; CHECK-PWR7-NEXT: std r5, 88(r1)
; CHECK-PWR7-NEXT: lxvw4x v2, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 272
+; CHECK-PWR7-NEXT: addi r3, r1, 288
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 256
+; CHECK-PWR7-NEXT: addi r3, r1, 272
+; CHECK-PWR7-NEXT: ld r14, 368(r1) # 8-byte Folded Reload
; CHECK-PWR7-NEXT: vmrghb v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 240
+; CHECK-PWR7-NEXT: addi r3, r1, 256
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 224
+; CHECK-PWR7-NEXT: addi r3, r1, 240
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
; CHECK-PWR7-NEXT: vmrghh v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 208
+; CHECK-PWR7-NEXT: addi r3, r1, 224
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 192
+; CHECK-PWR7-NEXT: addi r3, r1, 208
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 176
+; CHECK-PWR7-NEXT: addi r3, r1, 192
; CHECK-PWR7-NEXT: lxvw4x v5, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 160
+; CHECK-PWR7-NEXT: addi r3, r1, 176
; CHECK-PWR7-NEXT: vmrghb v4, v5, v4
; CHECK-PWR7-NEXT: vmrghh v3, v4, v3
; CHECK-PWR7-NEXT: xxmrghw vs0, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v2, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 144
+; CHECK-PWR7-NEXT: addi r3, r1, 160
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 128
+; CHECK-PWR7-NEXT: addi r3, r1, 144
; CHECK-PWR7-NEXT: vmrghb v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 112
+; CHECK-PWR7-NEXT: addi r3, r1, 128
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 96
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
+; CHECK-PWR7-NEXT: addi r3, r1, 112
; CHECK-PWR7-NEXT: vmrghh v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 80
+; CHECK-PWR7-NEXT: addi r3, r1, 96
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 64
+; CHECK-PWR7-NEXT: addi r3, r1, 80
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 48
+; CHECK-PWR7-NEXT: addi r3, r1, 64
; CHECK-PWR7-NEXT: lxvw4x v5, 0, r3
; CHECK-PWR7-NEXT: vmrghb v4, v5, v4
; CHECK-PWR7-NEXT: vmrghh v3, v4, v3
; CHECK-PWR7-NEXT: xxmrghw vs1, v3, v2
; CHECK-PWR7-NEXT: xxmrghd v2, vs1, vs0
-; CHECK-PWR7-NEXT: addi r1, r1, 448
+; CHECK-PWR7-NEXT: addi r1, r1, 512
; CHECK-PWR7-NEXT: blr
entry:
%vecext = extractelement <16 x i8> %a, i32 0
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index 246e6a6..117e3e4 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -3292,30 +3292,30 @@ define i64 @ustest_f64i64_mm(double %x) {
; RV32IF-NEXT: mv a1, a0
; RV32IF-NEXT: addi a0, sp, 8
; RV32IF-NEXT: call __fixdfti
-; RV32IF-NEXT: lw a0, 8(sp)
-; RV32IF-NEXT: lw a1, 12(sp)
-; RV32IF-NEXT: lw a2, 20(sp)
+; RV32IF-NEXT: lw a0, 20(sp)
+; RV32IF-NEXT: lw a1, 8(sp)
+; RV32IF-NEXT: lw a2, 12(sp)
; RV32IF-NEXT: lw a3, 16(sp)
-; RV32IF-NEXT: beqz a2, .LBB47_2
+; RV32IF-NEXT: beqz a0, .LBB47_2
; RV32IF-NEXT: # %bb.1: # %entry
-; RV32IF-NEXT: slti a4, a2, 0
+; RV32IF-NEXT: slti a4, a0, 0
; RV32IF-NEXT: j .LBB47_3
; RV32IF-NEXT: .LBB47_2:
; RV32IF-NEXT: seqz a4, a3
; RV32IF-NEXT: .LBB47_3: # %entry
; RV32IF-NEXT: xori a3, a3, 1
-; RV32IF-NEXT: or a3, a3, a2
+; RV32IF-NEXT: or a3, a3, a0
; RV32IF-NEXT: seqz a3, a3
; RV32IF-NEXT: addi a3, a3, -1
; RV32IF-NEXT: and a3, a3, a4
; RV32IF-NEXT: neg a3, a3
+; RV32IF-NEXT: and a2, a3, a2
; RV32IF-NEXT: and a1, a3, a1
; RV32IF-NEXT: and a0, a3, a0
-; RV32IF-NEXT: and a2, a3, a2
-; RV32IF-NEXT: slti a2, a2, 0
-; RV32IF-NEXT: addi a2, a2, -1
-; RV32IF-NEXT: and a0, a2, a0
-; RV32IF-NEXT: and a1, a2, a1
+; RV32IF-NEXT: slti a0, a0, 0
+; RV32IF-NEXT: addi a3, a0, -1
+; RV32IF-NEXT: and a0, a3, a1
+; RV32IF-NEXT: and a1, a3, a2
; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IF-NEXT: .cfi_restore ra
; RV32IF-NEXT: addi sp, sp, 32
@@ -3354,30 +3354,30 @@ define i64 @ustest_f64i64_mm(double %x) {
; RV32IFD-NEXT: .cfi_offset ra, -4
; RV32IFD-NEXT: addi a0, sp, 8
; RV32IFD-NEXT: call __fixdfti
-; RV32IFD-NEXT: lw a0, 8(sp)
-; RV32IFD-NEXT: lw a1, 12(sp)
-; RV32IFD-NEXT: lw a2, 20(sp)
+; RV32IFD-NEXT: lw a0, 20(sp)
+; RV32IFD-NEXT: lw a1, 8(sp)
+; RV32IFD-NEXT: lw a2, 12(sp)
; RV32IFD-NEXT: lw a3, 16(sp)
-; RV32IFD-NEXT: beqz a2, .LBB47_2
+; RV32IFD-NEXT: beqz a0, .LBB47_2
; RV32IFD-NEXT: # %bb.1: # %entry
-; RV32IFD-NEXT: slti a4, a2, 0
+; RV32IFD-NEXT: slti a4, a0, 0
; RV32IFD-NEXT: j .LBB47_3
; RV32IFD-NEXT: .LBB47_2:
; RV32IFD-NEXT: seqz a4, a3
; RV32IFD-NEXT: .LBB47_3: # %entry
; RV32IFD-NEXT: xori a3, a3, 1
-; RV32IFD-NEXT: or a3, a3, a2
+; RV32IFD-NEXT: or a3, a3, a0
; RV32IFD-NEXT: seqz a3, a3
; RV32IFD-NEXT: addi a3, a3, -1
; RV32IFD-NEXT: and a3, a3, a4
; RV32IFD-NEXT: neg a3, a3
+; RV32IFD-NEXT: and a2, a3, a2
; RV32IFD-NEXT: and a1, a3, a1
; RV32IFD-NEXT: and a0, a3, a0
-; RV32IFD-NEXT: and a2, a3, a2
-; RV32IFD-NEXT: slti a2, a2, 0
-; RV32IFD-NEXT: addi a2, a2, -1
-; RV32IFD-NEXT: and a0, a2, a0
-; RV32IFD-NEXT: and a1, a2, a1
+; RV32IFD-NEXT: slti a0, a0, 0
+; RV32IFD-NEXT: addi a3, a0, -1
+; RV32IFD-NEXT: and a0, a3, a1
+; RV32IFD-NEXT: and a1, a3, a2
; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: .cfi_restore ra
; RV32IFD-NEXT: addi sp, sp, 32
@@ -3530,30 +3530,30 @@ define i64 @ustest_f32i64_mm(float %x) {
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: call __fixsfti
-; RV32-NEXT: lw a0, 8(sp)
-; RV32-NEXT: lw a1, 12(sp)
-; RV32-NEXT: lw a2, 20(sp)
+; RV32-NEXT: lw a0, 20(sp)
+; RV32-NEXT: lw a1, 8(sp)
+; RV32-NEXT: lw a2, 12(sp)
; RV32-NEXT: lw a3, 16(sp)
-; RV32-NEXT: beqz a2, .LBB50_2
+; RV32-NEXT: beqz a0, .LBB50_2
; RV32-NEXT: # %bb.1: # %entry
-; RV32-NEXT: slti a4, a2, 0
+; RV32-NEXT: slti a4, a0, 0
; RV32-NEXT: j .LBB50_3
; RV32-NEXT: .LBB50_2:
; RV32-NEXT: seqz a4, a3
; RV32-NEXT: .LBB50_3: # %entry
; RV32-NEXT: xori a3, a3, 1
-; RV32-NEXT: or a3, a3, a2
+; RV32-NEXT: or a3, a3, a0
; RV32-NEXT: seqz a3, a3
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: and a3, a3, a4
; RV32-NEXT: neg a3, a3
+; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: and a1, a3, a1
; RV32-NEXT: and a0, a3, a0
-; RV32-NEXT: and a2, a3, a2
-; RV32-NEXT: slti a2, a2, 0
-; RV32-NEXT: addi a2, a2, -1
-; RV32-NEXT: and a0, a2, a0
-; RV32-NEXT: and a1, a2, a1
+; RV32-NEXT: slti a0, a0, 0
+; RV32-NEXT: addi a3, a0, -1
+; RV32-NEXT: and a0, a3, a1
+; RV32-NEXT: and a1, a3, a2
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: addi sp, sp, 32
@@ -3767,30 +3767,30 @@ define i64 @ustest_f16i64_mm(half %x) {
; RV32-NEXT: call __extendhfsf2
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: call __fixsfti
-; RV32-NEXT: lw a0, 8(sp)
-; RV32-NEXT: lw a1, 12(sp)
-; RV32-NEXT: lw a2, 20(sp)
+; RV32-NEXT: lw a0, 20(sp)
+; RV32-NEXT: lw a1, 8(sp)
+; RV32-NEXT: lw a2, 12(sp)
; RV32-NEXT: lw a3, 16(sp)
-; RV32-NEXT: beqz a2, .LBB53_2
+; RV32-NEXT: beqz a0, .LBB53_2
; RV32-NEXT: # %bb.1: # %entry
-; RV32-NEXT: slti a4, a2, 0
+; RV32-NEXT: slti a4, a0, 0
; RV32-NEXT: j .LBB53_3
; RV32-NEXT: .LBB53_2:
; RV32-NEXT: seqz a4, a3
; RV32-NEXT: .LBB53_3: # %entry
; RV32-NEXT: xori a3, a3, 1
-; RV32-NEXT: or a3, a3, a2
+; RV32-NEXT: or a3, a3, a0
; RV32-NEXT: seqz a3, a3
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: and a3, a3, a4
; RV32-NEXT: neg a3, a3
+; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: and a1, a3, a1
; RV32-NEXT: and a0, a3, a0
-; RV32-NEXT: and a2, a3, a2
-; RV32-NEXT: slti a2, a2, 0
-; RV32-NEXT: addi a2, a2, -1
-; RV32-NEXT: and a0, a2, a0
-; RV32-NEXT: and a1, a2, a1
+; RV32-NEXT: slti a0, a0, 0
+; RV32-NEXT: addi a3, a0, -1
+; RV32-NEXT: and a0, a3, a1
+; RV32-NEXT: and a1, a3, a2
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: addi sp, sp, 32
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
index 87c8343..a06c750 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
@@ -7,18 +7,18 @@
define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
; RV32-LABEL: ctz_nxv4i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; RV32-NEXT: vid.v v10
-; RV32-NEXT: vmv.v.i v11, -1
; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; RV32-NEXT: vid.v v10
+; RV32-NEXT: li a1, -1
; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; RV32-NEXT: vmsne.vi v0, v8, 0
; RV32-NEXT: srli a0, a0, 1
; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV32-NEXT: vmv.v.x v8, a0
-; RV32-NEXT: vmacc.vv v8, v10, v11
-; RV32-NEXT: vmv.v.i v9, 0
-; RV32-NEXT: vmerge.vvm v8, v9, v8, v0
+; RV32-NEXT: vmadd.vx v10, a1, v8
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vvm v8, v8, v10, v0
; RV32-NEXT: vredmaxu.vs v8, v8, v8
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: sub a0, a0, a1
@@ -28,18 +28,18 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
;
; RV64-LABEL: ctz_nxv4i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; RV64-NEXT: vid.v v10
-; RV64-NEXT: vmv.v.i v11, -1
; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; RV64-NEXT: vid.v v10
+; RV64-NEXT: li a1, -1
; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; RV64-NEXT: vmsne.vi v0, v8, 0
; RV64-NEXT: srli a0, a0, 1
; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV64-NEXT: vmv.v.x v8, a0
-; RV64-NEXT: vmacc.vv v8, v10, v11
-; RV64-NEXT: vmv.v.i v9, 0
-; RV64-NEXT: vmerge.vvm v8, v9, v8, v0
+; RV64-NEXT: vmadd.vx v10, a1, v8
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vvm v8, v8, v10, v0
; RV64-NEXT: vredmaxu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a1, v8
; RV64-NEXT: sub a0, a0, a1
@@ -109,17 +109,17 @@ define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) {
;
; RV64-LABEL: ctz_nxv8i1_no_range:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV64-NEXT: vid.v v16
-; RV64-NEXT: vmv.v.i v24, -1
; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV64-NEXT: vid.v v16
+; RV64-NEXT: li a1, -1
; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; RV64-NEXT: vmsne.vi v0, v8, 0
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV64-NEXT: vmv.v.x v8, a0
-; RV64-NEXT: vmacc.vv v8, v16, v24
-; RV64-NEXT: vmv.v.i v16, 0
-; RV64-NEXT: vmerge.vvm v8, v16, v8, v0
+; RV64-NEXT: vmadd.vx v16, a1, v8
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vvm v8, v8, v16, v0
; RV64-NEXT: vredmaxu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a1, v8
; RV64-NEXT: sub a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
index a050034..a7eaf39 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
@@ -78,12 +78,12 @@ body: |
; CHECK-NEXT: %false:vrnov0 = COPY $v9
; CHECK-NEXT: %mask:vmv0 = COPY $v0
; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
- ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %pt, %true, 8, 5 /* e32 */, 0 /* tu, mu */
+ ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %pt, %true, 4, 5 /* e32 */, 0 /* tu, mu */
%pt:vrnov0 = COPY $v8
%false:vrnov0 = COPY $v9
%mask:vmv0 = COPY $v0
- %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
- %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 8, 5 /* e32 */
+ %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 8, 5 /* e32 */, 0 /* tu, mu */
+ %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 4, 5 /* e32 */
...
---
# Shouldn't be converted because false operands are different
@@ -163,3 +163,47 @@ body: |
%true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
bb.1:
%5:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %false, %true, %mask, 4, 5 /* e32 */
+...
+---
+# Shouldn't be converted because vmerge adds back in elements from false past avl that would be lost if we converted to vmv.v.v
+name: preserve_false
+body: |
+ bb.0:
+ liveins: $v8, $v9, $v0, $x8, $x9
+ ; CHECK-LABEL: name: preserve_false
+ ; CHECK: liveins: $v8, $v9, $v0, $x8, $x9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %pt:vrnov0 = COPY $v8
+ ; CHECK-NEXT: %false:vr = COPY $v9
+ ; CHECK-NEXT: %mask:vmv0 = COPY $v0
+ ; CHECK-NEXT: %avl1:gprnox0 = COPY $x8
+ ; CHECK-NEXT: %avl2:gprnox0 = COPY $x9
+ ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, %avl1, 5 /* e32 */, 3 /* ta, ma */
+ ; CHECK-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, %avl2, 5 /* e32 */
+ %pt:vrnov0 = COPY $v8
+ %false:vr = COPY $v9
+ %mask:vmv0 = COPY $v0
+ %avl1:gprnox0 = COPY $x8
+ %avl2:gprnox0 = COPY $x9
+ %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, %avl1, 5 /* e32 */, 3 /* ta, ma */
+ %5:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, %avl2, 5 /* e32 */
+...
+---
+# But we can convert this one because vmerge's avl being <= true's means we don't lose any false elements past avl.
+name: preserve_false_avl_known_le
+body: |
+ bb.0:
+ liveins: $v8, $v9, $v0
+ ; CHECK-LABEL: name: preserve_false_avl_known_le
+ ; CHECK: liveins: $v8, $v9, $v0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %pt:vr = COPY $v8
+ ; CHECK-NEXT: %false:vrnov0 = COPY $v9
+ ; CHECK-NEXT: %mask:vmv0 = COPY $v0
+ ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 1, 5 /* e32 */, 3 /* ta, ma */
+ ; CHECK-NEXT: [[PseudoVMV_V_V_M1_:%[0-9]+]]:vr = PseudoVMV_V_V_M1 %pt, %true, 1, 5 /* e32 */, 0 /* tu, mu */
+ %pt:vrnov0 = COPY $v8
+ %false:vr = COPY $v9
+ %mask:vmv0 = COPY $v0
+ %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 2, 5 /* e32 */, 3 /* ta, ma */
+ %5:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 1, 5 /* e32 */
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll
index 3aeb4e8..9ffc84a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll
@@ -71,10 +71,31 @@ define <vscale x 8 x i64> @vpmerge_m8(<vscale x 8 x i64> %x, <vscale x 8 x i64>
ret <vscale x 8 x i64> %1
}
-declare <vscale x 1 x i8> @llvm.vp.merge.nxv1i8(<vscale x 1 x i1>, <vscale x 1 x i8>, <vscale x 1 x i8>, i32)
-declare <vscale x 2 x i8> @llvm.vp.merge.nxv2i8(<vscale x 2 x i1>, <vscale x 2 x i8>, <vscale x 2 x i8>, i32)
-declare <vscale x 4 x i8> @llvm.vp.merge.nxv4i8(<vscale x 4 x i1>, <vscale x 4 x i8>, <vscale x 4 x i8>, i32)
-declare <vscale x 8 x i8> @llvm.vp.merge.nxv8i8(<vscale x 8 x i1>, <vscale x 8 x i8>, <vscale x 8 x i8>, i32)
-declare <vscale x 8 x i16> @llvm.vp.merge.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
-declare <vscale x 8 x i32> @llvm.vp.merge.nxv8i32(<vscale x 8 x i1>, <vscale x 8 x i32>, <vscale x 8 x i32>, i32)
-declare <vscale x 8 x i64> @llvm.vp.merge.nxv8i64(<vscale x 8 x i1>, <vscale x 8 x i64>, <vscale x 8 x i64>, i32)
+; Shouldn't be converted because vmerge adds back in elements from false past avl that would be lost if we converted to vmv.v.v
+define <vscale x 2 x i32> @preserve_false(ptr %p, <vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i1> %mask, i64 %avl1, i64 %avl2) {
+; CHECK-LABEL: preserve_false:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v10, v9
+; CHECK-NEXT: vle32.v v10, (a0), v0.t
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, tu, ma
+; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0
+; CHECK-NEXT: ret
+ %true = call <vscale x 2 x i32> @llvm.riscv.vle.mask(<vscale x 2 x i32> %false, ptr %p, <vscale x 2 x i1> %mask, i64 %avl1, i64 3)
+ %res = call <vscale x 2 x i32> @llvm.riscv.vmerge(<vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i32> %true, <vscale x 2 x i1> %mask, i64 %avl2)
+ ret <vscale x 2 x i32> %res
+}
+
+; Can fold this because its avl is known to be <= than true, so no elements from false need to be introduced past avl.
+define <vscale x 2 x i32> @preserve_false_avl_known_le(ptr %p, <vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: preserve_false_avl_known_le:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT: vle32.v v9, (a0), v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma
+; CHECK-NEXT: vmv.v.v v8, v9
+; CHECK-NEXT: ret
+ %true = call <vscale x 2 x i32> @llvm.riscv.vle.mask(<vscale x 2 x i32> %false, ptr %p, <vscale x 2 x i1> %mask, i64 2, i64 3)
+ %res = call <vscale x 2 x i32> @llvm.riscv.vmerge(<vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i32> %true, <vscale x 2 x i1> %mask, i64 1)
+ ret <vscale x 2 x i32> %res
+}
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index 32753ca..cd7f30d 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -716,92 +716,101 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: lbu a5, 8(a0)
+; RV32I-NEXT: lbu a6, 9(a0)
+; RV32I-NEXT: lbu t3, 10(a0)
+; RV32I-NEXT: lbu t4, 11(a0)
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: or a4, a4, a3
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a3, t0, a7
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 9(a0)
-; RV32I-NEXT: lbu t0, 10(a0)
-; RV32I-NEXT: lbu t3, 11(a0)
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: slli a7, a7, 8
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: or t1, t2, t1
-; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a7, t3, t0
-; RV32I-NEXT: lbu t0, 12(a0)
-; RV32I-NEXT: lbu t2, 13(a0)
-; RV32I-NEXT: lbu t3, 14(a0)
-; RV32I-NEXT: lbu t4, 15(a0)
-; RV32I-NEXT: lbu a0, 0(a1)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 12(a0)
+; RV32I-NEXT: lbu t1, 13(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t3, t4, t3
+; RV32I-NEXT: or a6, t1, a6
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu t1, 1(a1)
+; RV32I-NEXT: lbu t2, 0(a1)
+; RV32I-NEXT: lbu t4, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
; RV32I-NEXT: sw zero, 16(sp)
; RV32I-NEXT: sw zero, 20(sp)
; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: or a1, t2, t0
-; RV32I-NEXT: mv t0, sp
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: or t2, t4, t3
-; RV32I-NEXT: srli t3, a0, 3
-; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: andi a5, a0, 31
-; RV32I-NEXT: andi t3, t3, 12
-; RV32I-NEXT: xori a5, a5, 31
-; RV32I-NEXT: or a3, t1, a3
-; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a1, t2, a1
-; RV32I-NEXT: add t0, t0, t3
-; RV32I-NEXT: sw a4, 0(sp)
-; RV32I-NEXT: sw a3, 4(sp)
-; RV32I-NEXT: sw a6, 8(sp)
-; RV32I-NEXT: sw a1, 12(sp)
-; RV32I-NEXT: lw a1, 4(t0)
-; RV32I-NEXT: lw a3, 8(t0)
-; RV32I-NEXT: lw a4, 0(t0)
-; RV32I-NEXT: lw a6, 12(t0)
-; RV32I-NEXT: srl a7, a1, a0
-; RV32I-NEXT: slli t0, a3, 1
-; RV32I-NEXT: srl a4, a4, a0
-; RV32I-NEXT: slli a1, a1, 1
-; RV32I-NEXT: srl a3, a3, a0
-; RV32I-NEXT: slli t1, a6, 1
-; RV32I-NEXT: srl a0, a6, a0
-; RV32I-NEXT: sll a6, t0, a5
-; RV32I-NEXT: sll a1, a1, a5
-; RV32I-NEXT: sll a5, t1, a5
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t4
+; RV32I-NEXT: mv t2, sp
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, t0, a7
+; RV32I-NEXT: or a5, t3, a5
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: srli a0, a1, 3
+; RV32I-NEXT: andi a3, a1, 31
+; RV32I-NEXT: andi a0, a0, 12
+; RV32I-NEXT: xori a3, a3, 31
+; RV32I-NEXT: add a0, t2, a0
+; RV32I-NEXT: lw a4, 4(a0)
+; RV32I-NEXT: lw a5, 8(a0)
+; RV32I-NEXT: lw a6, 0(a0)
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: srl a7, a4, a1
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: srl a6, a6, a1
+; RV32I-NEXT: slli a4, a4, 1
+; RV32I-NEXT: srl a5, a5, a1
+; RV32I-NEXT: slli t1, a0, 1
+; RV32I-NEXT: srl a0, a0, a1
+; RV32I-NEXT: sll a1, t0, a3
+; RV32I-NEXT: sll a4, a4, a3
+; RV32I-NEXT: sll a3, t1, a3
; RV32I-NEXT: srli t0, a0, 16
; RV32I-NEXT: srli t1, a0, 24
; RV32I-NEXT: srli t2, a0, 8
-; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a1, a4, a1
-; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: or a1, a7, a1
+; RV32I-NEXT: or a4, a6, a4
+; RV32I-NEXT: or a3, a5, a3
; RV32I-NEXT: sb a0, 12(a2)
; RV32I-NEXT: sb t2, 13(a2)
; RV32I-NEXT: sb t0, 14(a2)
; RV32I-NEXT: sb t1, 15(a2)
; RV32I-NEXT: srli a0, a3, 16
-; RV32I-NEXT: srli a4, a3, 24
-; RV32I-NEXT: srli a5, a3, 8
-; RV32I-NEXT: srli a7, a1, 16
-; RV32I-NEXT: srli t0, a1, 24
-; RV32I-NEXT: srli t1, a1, 8
-; RV32I-NEXT: srli t2, a6, 16
-; RV32I-NEXT: srli t3, a6, 24
+; RV32I-NEXT: srli a5, a3, 24
+; RV32I-NEXT: srli a6, a3, 8
+; RV32I-NEXT: srli a7, a4, 16
+; RV32I-NEXT: srli t0, a4, 24
+; RV32I-NEXT: srli t1, a4, 8
+; RV32I-NEXT: srli t2, a1, 16
+; RV32I-NEXT: srli t3, a1, 24
; RV32I-NEXT: sb a3, 8(a2)
-; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: sb a6, 9(a2)
; RV32I-NEXT: sb a0, 10(a2)
-; RV32I-NEXT: sb a4, 11(a2)
-; RV32I-NEXT: srli a0, a6, 8
-; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a5, 11(a2)
+; RV32I-NEXT: srli a0, a1, 8
+; RV32I-NEXT: sb a4, 0(a2)
; RV32I-NEXT: sb t1, 1(a2)
; RV32I-NEXT: sb a7, 2(a2)
; RV32I-NEXT: sb t0, 3(a2)
-; RV32I-NEXT: sb a6, 4(a2)
+; RV32I-NEXT: sb a1, 4(a2)
; RV32I-NEXT: sb a0, 5(a2)
; RV32I-NEXT: sb t2, 6(a2)
; RV32I-NEXT: sb t3, 7(a2)
@@ -943,93 +952,102 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: lbu a5, 8(a0)
+; RV32I-NEXT: lbu a6, 9(a0)
+; RV32I-NEXT: lbu t3, 10(a0)
+; RV32I-NEXT: lbu t4, 11(a0)
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: or a4, a4, a3
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a3, t0, a7
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 9(a0)
-; RV32I-NEXT: lbu t0, 10(a0)
-; RV32I-NEXT: lbu t3, 11(a0)
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: slli a7, a7, 8
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: or t1, t2, t1
-; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a7, t3, t0
-; RV32I-NEXT: lbu t0, 12(a0)
-; RV32I-NEXT: lbu t2, 13(a0)
-; RV32I-NEXT: lbu t3, 14(a0)
-; RV32I-NEXT: lbu t4, 15(a0)
-; RV32I-NEXT: lbu a0, 0(a1)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 12(a0)
+; RV32I-NEXT: lbu t1, 13(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t3, t4, t3
+; RV32I-NEXT: or a6, t1, a6
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu t1, 1(a1)
+; RV32I-NEXT: lbu t2, 0(a1)
+; RV32I-NEXT: lbu t4, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
; RV32I-NEXT: sw zero, 0(sp)
; RV32I-NEXT: sw zero, 4(sp)
; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 12(sp)
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: or a1, t2, t0
-; RV32I-NEXT: addi t0, sp, 16
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: or t2, t4, t3
-; RV32I-NEXT: srli t3, a0, 3
-; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: andi a5, a0, 31
-; RV32I-NEXT: andi t3, t3, 12
-; RV32I-NEXT: or a3, t1, a3
-; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a1, t2, a1
-; RV32I-NEXT: sub a7, t0, t3
-; RV32I-NEXT: sw a4, 16(sp)
-; RV32I-NEXT: sw a3, 20(sp)
-; RV32I-NEXT: sw a6, 24(sp)
-; RV32I-NEXT: sw a1, 28(sp)
-; RV32I-NEXT: lw a1, 0(a7)
-; RV32I-NEXT: lw a3, 4(a7)
-; RV32I-NEXT: lw a4, 8(a7)
-; RV32I-NEXT: lw a6, 12(a7)
-; RV32I-NEXT: xori a5, a5, 31
-; RV32I-NEXT: sll a7, a3, a0
-; RV32I-NEXT: srli t0, a1, 1
-; RV32I-NEXT: sll a6, a6, a0
-; RV32I-NEXT: srli t1, a4, 1
-; RV32I-NEXT: sll a4, a4, a0
-; RV32I-NEXT: srli a3, a3, 1
-; RV32I-NEXT: sll a0, a1, a0
-; RV32I-NEXT: srl a1, t0, a5
-; RV32I-NEXT: srl t0, t1, a5
-; RV32I-NEXT: srl a3, a3, a5
-; RV32I-NEXT: srli a5, a0, 16
-; RV32I-NEXT: srli t1, a0, 24
-; RV32I-NEXT: srli t2, a0, 8
-; RV32I-NEXT: or a1, a7, a1
-; RV32I-NEXT: or a6, a6, t0
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t4
+; RV32I-NEXT: addi t2, sp, 16
; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: sb a0, 0(a2)
+; RV32I-NEXT: or a4, t0, a7
+; RV32I-NEXT: or a5, t3, a5
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: sw a4, 20(sp)
+; RV32I-NEXT: sw a5, 24(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: srli a0, a1, 3
+; RV32I-NEXT: andi a3, a1, 31
+; RV32I-NEXT: andi a0, a0, 12
+; RV32I-NEXT: sub a0, t2, a0
+; RV32I-NEXT: lw a4, 0(a0)
+; RV32I-NEXT: lw a5, 4(a0)
+; RV32I-NEXT: lw a6, 8(a0)
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: xori a3, a3, 31
+; RV32I-NEXT: sll a7, a5, a1
+; RV32I-NEXT: srli t0, a4, 1
+; RV32I-NEXT: sll a0, a0, a1
+; RV32I-NEXT: srli t1, a6, 1
+; RV32I-NEXT: sll a6, a6, a1
+; RV32I-NEXT: srli a5, a5, 1
+; RV32I-NEXT: sll a1, a4, a1
+; RV32I-NEXT: srl a4, t0, a3
+; RV32I-NEXT: srl t0, t1, a3
+; RV32I-NEXT: srl a3, a5, a3
+; RV32I-NEXT: srli a5, a1, 16
+; RV32I-NEXT: srli t1, a1, 24
+; RV32I-NEXT: srli t2, a1, 8
+; RV32I-NEXT: or a4, a7, a4
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: sb a1, 0(a2)
; RV32I-NEXT: sb t2, 1(a2)
; RV32I-NEXT: sb a5, 2(a2)
; RV32I-NEXT: sb t1, 3(a2)
-; RV32I-NEXT: srli a0, a3, 16
-; RV32I-NEXT: srli a4, a3, 24
-; RV32I-NEXT: srli a5, a3, 8
-; RV32I-NEXT: srli a7, a6, 16
-; RV32I-NEXT: srli t0, a6, 24
-; RV32I-NEXT: srli t1, a6, 8
-; RV32I-NEXT: srli t2, a1, 16
-; RV32I-NEXT: srli t3, a1, 24
+; RV32I-NEXT: srli a1, a3, 16
+; RV32I-NEXT: srli a5, a3, 24
+; RV32I-NEXT: srli a6, a3, 8
+; RV32I-NEXT: srli a7, a0, 16
+; RV32I-NEXT: srli t0, a0, 24
+; RV32I-NEXT: srli t1, a0, 8
+; RV32I-NEXT: srli t2, a4, 16
+; RV32I-NEXT: srli t3, a4, 24
; RV32I-NEXT: sb a3, 8(a2)
-; RV32I-NEXT: sb a5, 9(a2)
-; RV32I-NEXT: sb a0, 10(a2)
-; RV32I-NEXT: sb a4, 11(a2)
-; RV32I-NEXT: srli a0, a1, 8
-; RV32I-NEXT: sb a6, 12(a2)
+; RV32I-NEXT: sb a6, 9(a2)
+; RV32I-NEXT: sb a1, 10(a2)
+; RV32I-NEXT: sb a5, 11(a2)
+; RV32I-NEXT: srli a1, a4, 8
+; RV32I-NEXT: sb a0, 12(a2)
; RV32I-NEXT: sb t1, 13(a2)
; RV32I-NEXT: sb a7, 14(a2)
; RV32I-NEXT: sb t0, 15(a2)
-; RV32I-NEXT: sb a1, 4(a2)
-; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: sb a1, 5(a2)
; RV32I-NEXT: sb t2, 6(a2)
; RV32I-NEXT: sb t3, 7(a2)
; RV32I-NEXT: addi sp, sp, 32
@@ -1168,73 +1186,82 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu t1, 6(a0)
; RV32I-NEXT: lbu t2, 7(a0)
; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 8(a0)
+; RV32I-NEXT: lbu t3, 9(a0)
+; RV32I-NEXT: lbu t4, 10(a0)
+; RV32I-NEXT: lbu t5, 11(a0)
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 9(a0)
-; RV32I-NEXT: lbu t0, 10(a0)
-; RV32I-NEXT: lbu t3, 11(a0)
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: slli a7, a7, 8
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: or t1, t2, t1
-; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a7, t3, t0
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a7, t2, t1
; RV32I-NEXT: lbu t0, 12(a0)
-; RV32I-NEXT: lbu t2, 13(a0)
-; RV32I-NEXT: lbu t3, 14(a0)
-; RV32I-NEXT: lbu t4, 15(a0)
-; RV32I-NEXT: lbu a0, 0(a1)
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: or a1, t2, t0
-; RV32I-NEXT: mv t0, sp
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: srli a4, a0, 3
-; RV32I-NEXT: or a5, t1, a5
-; RV32I-NEXT: andi t1, a0, 31
-; RV32I-NEXT: or t2, t4, t3
-; RV32I-NEXT: srai t3, t4, 31
-; RV32I-NEXT: andi a4, a4, 12
-; RV32I-NEXT: xori t1, t1, 31
+; RV32I-NEXT: lbu t1, 13(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t5, t5, 24
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or a4, t3, a4
+; RV32I-NEXT: or t3, t5, t4
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 1(a1)
+; RV32I-NEXT: lbu t4, 0(a1)
+; RV32I-NEXT: lbu t5, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t4
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t5
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: mv a5, sp
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t2, a0, t2
+; RV32I-NEXT: srai a0, a0, 31
; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a1, t2, a1
-; RV32I-NEXT: sw t3, 16(sp)
-; RV32I-NEXT: sw t3, 20(sp)
-; RV32I-NEXT: sw t3, 24(sp)
-; RV32I-NEXT: sw t3, 28(sp)
-; RV32I-NEXT: add a4, t0, a4
+; RV32I-NEXT: or a4, t3, a4
+; RV32I-NEXT: or a7, t2, t0
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: sw a0, 16(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 24(sp)
+; RV32I-NEXT: sw a0, 28(sp)
; RV32I-NEXT: sw a3, 0(sp)
-; RV32I-NEXT: sw a5, 4(sp)
-; RV32I-NEXT: sw a6, 8(sp)
-; RV32I-NEXT: sw a1, 12(sp)
-; RV32I-NEXT: lw a1, 4(a4)
-; RV32I-NEXT: lw a3, 8(a4)
-; RV32I-NEXT: lw a5, 0(a4)
-; RV32I-NEXT: lw a4, 12(a4)
-; RV32I-NEXT: srl a6, a1, a0
-; RV32I-NEXT: slli a7, a3, 1
-; RV32I-NEXT: srl a5, a5, a0
-; RV32I-NEXT: slli a1, a1, 1
-; RV32I-NEXT: srl a3, a3, a0
-; RV32I-NEXT: slli t0, a4, 1
-; RV32I-NEXT: sra a0, a4, a0
-; RV32I-NEXT: sll a4, a7, t1
-; RV32I-NEXT: sll a1, a1, t1
-; RV32I-NEXT: sll a7, t0, t1
+; RV32I-NEXT: sw a6, 4(sp)
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a7, 12(sp)
+; RV32I-NEXT: srli a0, a1, 3
+; RV32I-NEXT: andi a3, a1, 31
+; RV32I-NEXT: andi a0, a0, 12
+; RV32I-NEXT: xori a3, a3, 31
+; RV32I-NEXT: add a0, a5, a0
+; RV32I-NEXT: lw a4, 4(a0)
+; RV32I-NEXT: lw a5, 8(a0)
+; RV32I-NEXT: lw a6, 0(a0)
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: srl a7, a4, a1
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: srl a6, a6, a1
+; RV32I-NEXT: slli a4, a4, 1
+; RV32I-NEXT: srl a5, a5, a1
+; RV32I-NEXT: slli t1, a0, 1
+; RV32I-NEXT: sra a0, a0, a1
+; RV32I-NEXT: sll a1, t0, a3
+; RV32I-NEXT: sll a4, a4, a3
+; RV32I-NEXT: sll a3, t1, a3
; RV32I-NEXT: srli t0, a0, 16
; RV32I-NEXT: srli t1, a0, 24
; RV32I-NEXT: srli t2, a0, 8
+; RV32I-NEXT: or a1, a7, a1
; RV32I-NEXT: or a4, a6, a4
-; RV32I-NEXT: or a1, a5, a1
-; RV32I-NEXT: or a3, a3, a7
+; RV32I-NEXT: or a3, a5, a3
; RV32I-NEXT: sb a0, 12(a2)
; RV32I-NEXT: sb t2, 13(a2)
; RV32I-NEXT: sb t0, 14(a2)
@@ -1242,21 +1269,21 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli a0, a3, 16
; RV32I-NEXT: srli a5, a3, 24
; RV32I-NEXT: srli a6, a3, 8
-; RV32I-NEXT: srli a7, a1, 16
-; RV32I-NEXT: srli t0, a1, 24
-; RV32I-NEXT: srli t1, a1, 8
-; RV32I-NEXT: srli t2, a4, 16
-; RV32I-NEXT: srli t3, a4, 24
+; RV32I-NEXT: srli a7, a4, 16
+; RV32I-NEXT: srli t0, a4, 24
+; RV32I-NEXT: srli t1, a4, 8
+; RV32I-NEXT: srli t2, a1, 16
+; RV32I-NEXT: srli t3, a1, 24
; RV32I-NEXT: sb a3, 8(a2)
; RV32I-NEXT: sb a6, 9(a2)
; RV32I-NEXT: sb a0, 10(a2)
; RV32I-NEXT: sb a5, 11(a2)
-; RV32I-NEXT: srli a0, a4, 8
-; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: srli a0, a1, 8
+; RV32I-NEXT: sb a4, 0(a2)
; RV32I-NEXT: sb t1, 1(a2)
; RV32I-NEXT: sb a7, 2(a2)
; RV32I-NEXT: sb t0, 3(a2)
-; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: sb a1, 4(a2)
; RV32I-NEXT: sb a0, 5(a2)
; RV32I-NEXT: sb t2, 6(a2)
; RV32I-NEXT: sb t3, 7(a2)
@@ -1272,17 +1299,19 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: lshr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -144
-; RV64I-NEXT: sd s0, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 112(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 104(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 96(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 80(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -160
+; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill
; RV64I-NEXT: lbu a3, 0(a0)
; RV64I-NEXT: lbu a4, 1(a0)
; RV64I-NEXT: lbu a5, 2(a0)
@@ -1299,122 +1328,143 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: lbu s1, 13(a0)
; RV64I-NEXT: lbu s2, 14(a0)
; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
; RV64I-NEXT: lbu s4, 16(a0)
; RV64I-NEXT: lbu s5, 17(a0)
; RV64I-NEXT: lbu s6, 18(a0)
; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli s8, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
; RV64I-NEXT: slli t0, t0, 8
; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or a5, a4, a3
+; RV64I-NEXT: or a6, a6, s8
+; RV64I-NEXT: or a3, t0, a7
+; RV64I-NEXT: or a4, t2, t1
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
; RV64I-NEXT: slli t4, t4, 8
; RV64I-NEXT: slli t5, t5, 16
; RV64I-NEXT: slli t6, t6, 24
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
-; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: lbu t5, 20(a0)
-; RV64I-NEXT: lbu t6, 21(a0)
-; RV64I-NEXT: lbu s8, 22(a0)
-; RV64I-NEXT: lbu s9, 23(a0)
; RV64I-NEXT: slli s1, s1, 8
; RV64I-NEXT: slli s2, s2, 16
; RV64I-NEXT: slli s3, s3, 24
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: or t0, t6, t5
+; RV64I-NEXT: or t1, s1, s0
+; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: lbu t6, 24(a0)
+; RV64I-NEXT: lbu s0, 25(a0)
+; RV64I-NEXT: lbu s1, 26(a0)
+; RV64I-NEXT: lbu s2, 27(a0)
; RV64I-NEXT: slli s5, s5, 8
; RV64I-NEXT: slli s6, s6, 16
; RV64I-NEXT: slli s7, s7, 24
-; RV64I-NEXT: or t1, s1, s0
-; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: slli s9, s9, 8
; RV64I-NEXT: or t3, s5, s4
; RV64I-NEXT: or t4, s7, s6
-; RV64I-NEXT: lbu s0, 24(a0)
-; RV64I-NEXT: lbu s1, 25(a0)
-; RV64I-NEXT: lbu s2, 26(a0)
-; RV64I-NEXT: lbu s3, 27(a0)
-; RV64I-NEXT: slli t6, t6, 8
-; RV64I-NEXT: slli s8, s8, 16
-; RV64I-NEXT: slli s9, s9, 24
-; RV64I-NEXT: slli s1, s1, 8
-; RV64I-NEXT: or t5, t6, t5
-; RV64I-NEXT: or t6, s9, s8
-; RV64I-NEXT: or s0, s1, s0
-; RV64I-NEXT: lbu s1, 28(a0)
+; RV64I-NEXT: or t5, s9, s8
+; RV64I-NEXT: lbu s3, 28(a0)
; RV64I-NEXT: lbu s4, 29(a0)
; RV64I-NEXT: lbu s5, 30(a0)
; RV64I-NEXT: lbu s6, 31(a0)
-; RV64I-NEXT: lbu a0, 0(a1)
+; RV64I-NEXT: slli s10, s10, 16
+; RV64I-NEXT: slli s11, s11, 24
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: slli s1, s1, 16
+; RV64I-NEXT: slli s2, s2, 24
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or a0, s11, s10
+; RV64I-NEXT: or t6, s0, t6
+; RV64I-NEXT: or s0, s2, s1
+; RV64I-NEXT: or s1, s4, s3
+; RV64I-NEXT: lbu s2, 0(a1)
+; RV64I-NEXT: lbu s3, 1(a1)
+; RV64I-NEXT: lbu s4, 2(a1)
+; RV64I-NEXT: lbu s7, 3(a1)
+; RV64I-NEXT: slli s5, s5, 16
+; RV64I-NEXT: slli s6, s6, 24
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: slli s7, s7, 24
+; RV64I-NEXT: or s5, s6, s5
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, s7, s4
+; RV64I-NEXT: lbu s4, 5(a1)
+; RV64I-NEXT: lbu s6, 4(a1)
+; RV64I-NEXT: lbu s7, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or s4, s4, s6
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, s7
; RV64I-NEXT: sd zero, 32(sp)
; RV64I-NEXT: sd zero, 40(sp)
; RV64I-NEXT: sd zero, 48(sp)
; RV64I-NEXT: sd zero, 56(sp)
-; RV64I-NEXT: slli s2, s2, 16
-; RV64I-NEXT: slli s3, s3, 24
-; RV64I-NEXT: or a1, s3, s2
-; RV64I-NEXT: mv s2, sp
-; RV64I-NEXT: slli s4, s4, 8
-; RV64I-NEXT: slli s5, s5, 16
-; RV64I-NEXT: slli s6, s6, 24
-; RV64I-NEXT: or s1, s4, s1
-; RV64I-NEXT: srli s3, a0, 3
-; RV64I-NEXT: or s4, s6, s5
-; RV64I-NEXT: andi s5, a0, 63
-; RV64I-NEXT: andi s3, s3, 24
-; RV64I-NEXT: xori s5, s5, 63
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
-; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: or a1, a1, s0
-; RV64I-NEXT: or t1, s4, s1
-; RV64I-NEXT: add s2, s2, s3
-; RV64I-NEXT: slli a4, a4, 32
-; RV64I-NEXT: slli a6, a6, 32
-; RV64I-NEXT: slli t0, t0, 32
-; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: mv a6, sp
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a1, t1, a1
+; RV64I-NEXT: or a4, t0, a7
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: or t0, t4, t3
+; RV64I-NEXT: or a0, a0, t5
+; RV64I-NEXT: or t1, s0, t6
+; RV64I-NEXT: or t2, s5, s1
+; RV64I-NEXT: or t3, s3, s2
+; RV64I-NEXT: or a1, a1, s4
+; RV64I-NEXT: slli a3, a3, 32
+; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli t2, t2, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a3, a3, a5
+; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: or a5, t2, t1
+; RV64I-NEXT: or a1, a1, t3
; RV64I-NEXT: sd a3, 0(sp)
; RV64I-NEXT: sd a4, 8(sp)
-; RV64I-NEXT: sd a5, 16(sp)
-; RV64I-NEXT: sd a1, 24(sp)
-; RV64I-NEXT: ld a1, 8(s2)
-; RV64I-NEXT: ld a3, 16(s2)
-; RV64I-NEXT: ld a4, 0(s2)
-; RV64I-NEXT: ld a5, 24(s2)
-; RV64I-NEXT: srl a6, a1, a0
-; RV64I-NEXT: slli a7, a3, 1
-; RV64I-NEXT: srl a4, a4, a0
-; RV64I-NEXT: slli a1, a1, 1
-; RV64I-NEXT: srl a3, a3, a0
+; RV64I-NEXT: sd a0, 16(sp)
+; RV64I-NEXT: sd a5, 24(sp)
+; RV64I-NEXT: srli a0, a1, 3
+; RV64I-NEXT: andi a3, a1, 63
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: xori a3, a3, 63
+; RV64I-NEXT: add a0, a6, a0
+; RV64I-NEXT: ld a4, 8(a0)
+; RV64I-NEXT: ld a5, 16(a0)
+; RV64I-NEXT: ld a6, 0(a0)
+; RV64I-NEXT: ld a0, 24(a0)
+; RV64I-NEXT: srl a7, a4, a1
; RV64I-NEXT: slli t0, a5, 1
-; RV64I-NEXT: srl a5, a5, a0
-; RV64I-NEXT: sll a0, a7, s5
-; RV64I-NEXT: sll a1, a1, s5
-; RV64I-NEXT: sll a7, t0, s5
-; RV64I-NEXT: srli t0, a5, 56
-; RV64I-NEXT: srli t1, a5, 48
-; RV64I-NEXT: srli t2, a5, 40
-; RV64I-NEXT: srli t3, a5, 32
-; RV64I-NEXT: srli t4, a5, 24
-; RV64I-NEXT: srli t5, a5, 16
-; RV64I-NEXT: srli t6, a5, 8
-; RV64I-NEXT: or a0, a6, a0
-; RV64I-NEXT: or a1, a4, a1
-; RV64I-NEXT: or a3, a3, a7
+; RV64I-NEXT: srl a6, a6, a1
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: srl a5, a5, a1
+; RV64I-NEXT: slli t1, a0, 1
+; RV64I-NEXT: srl t2, a0, a1
+; RV64I-NEXT: sll a0, t0, a3
+; RV64I-NEXT: sll a1, a4, a3
+; RV64I-NEXT: sll a3, t1, a3
+; RV64I-NEXT: srli a4, t2, 56
+; RV64I-NEXT: srli t0, t2, 48
+; RV64I-NEXT: srli t1, t2, 40
+; RV64I-NEXT: srli t3, t2, 32
+; RV64I-NEXT: srli t4, t2, 24
+; RV64I-NEXT: srli t5, t2, 16
+; RV64I-NEXT: srli t6, t2, 8
+; RV64I-NEXT: or a0, a7, a0
+; RV64I-NEXT: or a1, a6, a1
+; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: sb t3, 28(a2)
-; RV64I-NEXT: sb t2, 29(a2)
-; RV64I-NEXT: sb t1, 30(a2)
-; RV64I-NEXT: sb t0, 31(a2)
-; RV64I-NEXT: sb a5, 24(a2)
+; RV64I-NEXT: sb t1, 29(a2)
+; RV64I-NEXT: sb t0, 30(a2)
+; RV64I-NEXT: sb a4, 31(a2)
+; RV64I-NEXT: sb t2, 24(a2)
; RV64I-NEXT: sb t6, 25(a2)
; RV64I-NEXT: sb t5, 26(a2)
; RV64I-NEXT: sb t4, 27(a2)
@@ -1463,17 +1513,19 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: sb a1, 9(a2)
; RV64I-NEXT: sb a5, 10(a2)
; RV64I-NEXT: sb a3, 11(a2)
-; RV64I-NEXT: ld s0, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 112(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 104(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 96(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 88(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 80(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 72(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 64(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 144
+; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 160
; RV64I-NEXT: ret
;
; RV32I-LABEL: lshr_32bytes:
@@ -1498,55 +1550,67 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu a7, 3(a0)
; RV32I-NEXT: lbu a5, 4(a0)
; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t3, 6(a0)
-; RV32I-NEXT: lbu t6, 7(a0)
-; RV32I-NEXT: lbu s2, 8(a0)
-; RV32I-NEXT: lbu s3, 9(a0)
-; RV32I-NEXT: lbu s4, 10(a0)
-; RV32I-NEXT: lbu s5, 11(a0)
-; RV32I-NEXT: lbu s7, 12(a0)
-; RV32I-NEXT: lbu s8, 13(a0)
-; RV32I-NEXT: lbu s9, 14(a0)
-; RV32I-NEXT: lbu s10, 15(a0)
-; RV32I-NEXT: lbu s11, 16(a0)
-; RV32I-NEXT: lbu ra, 17(a0)
-; RV32I-NEXT: lbu t4, 18(a0)
-; RV32I-NEXT: lbu s0, 19(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s2, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s5, 15(a0)
+; RV32I-NEXT: lbu s6, 16(a0)
+; RV32I-NEXT: lbu s7, 17(a0)
+; RV32I-NEXT: lbu s8, 18(a0)
+; RV32I-NEXT: lbu s9, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
; RV32I-NEXT: or a4, a7, a6
-; RV32I-NEXT: lbu t1, 20(a0)
-; RV32I-NEXT: lbu t2, 21(a0)
-; RV32I-NEXT: lbu t5, 22(a0)
-; RV32I-NEXT: lbu s1, 23(a0)
+; RV32I-NEXT: lbu s10, 20(a0)
+; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: lbu ra, 22(a0)
+; RV32I-NEXT: lbu a3, 23(a0)
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or a5, t0, a5
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: lbu s1, 24(a0)
+; RV32I-NEXT: lbu s3, 25(a0)
+; RV32I-NEXT: lbu t4, 26(a0)
+; RV32I-NEXT: lbu t5, 27(a0)
+; RV32I-NEXT: slli s2, s2, 8
; RV32I-NEXT: slli s4, s4, 16
; RV32I-NEXT: slli s5, s5, 24
-; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: or a6, t6, t3
-; RV32I-NEXT: or a7, s3, s2
-; RV32I-NEXT: or t0, s5, s4
-; RV32I-NEXT: lbu t3, 24(a0)
-; RV32I-NEXT: lbu s5, 25(a0)
-; RV32I-NEXT: lbu s6, 26(a0)
-; RV32I-NEXT: lbu t6, 27(a0)
-; RV32I-NEXT: slli s8, s8, 8
-; RV32I-NEXT: slli s9, s9, 16
-; RV32I-NEXT: slli s10, s10, 24
-; RV32I-NEXT: slli ra, ra, 8
-; RV32I-NEXT: or s7, s8, s7
-; RV32I-NEXT: or s2, s10, s9
-; RV32I-NEXT: or s3, ra, s11
-; RV32I-NEXT: lbu s4, 28(a0)
-; RV32I-NEXT: lbu s8, 29(a0)
-; RV32I-NEXT: lbu s9, 30(a0)
-; RV32I-NEXT: lbu s10, 31(a0)
-; RV32I-NEXT: lbu a0, 0(a1)
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: or t1, s2, s0
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: or t3, s7, s6
+; RV32I-NEXT: lbu t6, 28(a0)
+; RV32I-NEXT: lbu s4, 29(a0)
+; RV32I-NEXT: lbu s5, 30(a0)
+; RV32I-NEXT: lbu s6, 31(a0)
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: slli ra, ra, 16
+; RV32I-NEXT: slli a3, a3, 24
+; RV32I-NEXT: or a0, s9, s8
+; RV32I-NEXT: or s0, s11, s10
+; RV32I-NEXT: or s2, a3, ra
+; RV32I-NEXT: lbu a3, 0(a1)
+; RV32I-NEXT: lbu s7, 1(a1)
+; RV32I-NEXT: lbu s8, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
; RV32I-NEXT: sw zero, 56(sp)
; RV32I-NEXT: sw zero, 60(sp)
; RV32I-NEXT: sw zero, 64(sp)
@@ -1555,90 +1619,89 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sw zero, 44(sp)
; RV32I-NEXT: sw zero, 48(sp)
; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or s1, s3, s1
+; RV32I-NEXT: addi s3, sp, 8
; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: or t4, s0, t4
-; RV32I-NEXT: addi s0, sp, 8
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: slli t5, t5, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: slli s5, s5, 8
-; RV32I-NEXT: slli s6, s6, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s8, s8, 8
-; RV32I-NEXT: slli s9, s9, 16
-; RV32I-NEXT: slli s10, s10, 24
-; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: slli t5, t5, 24
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 16
+; RV32I-NEXT: slli s6, s6, 24
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s4, t6
+; RV32I-NEXT: or t6, s6, s5
+; RV32I-NEXT: or a3, s7, a3
+; RV32I-NEXT: or a1, a1, s8
+; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a4, a4, s4
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a7, t2, t1
+; RV32I-NEXT: or t0, a0, t3
+; RV32I-NEXT: or t1, s2, s0
+; RV32I-NEXT: or t2, t4, s1
+; RV32I-NEXT: or t3, t6, t5
+; RV32I-NEXT: or a0, a1, a3
+; RV32I-NEXT: sw t0, 24(sp)
+; RV32I-NEXT: sw t1, 28(sp)
+; RV32I-NEXT: sw t2, 32(sp)
+; RV32I-NEXT: sw t3, 36(sp)
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a5, 12(sp)
+; RV32I-NEXT: sw a6, 16(sp)
+; RV32I-NEXT: sw a7, 20(sp)
; RV32I-NEXT: srli a1, a0, 3
-; RV32I-NEXT: or t2, s1, t5
-; RV32I-NEXT: andi t5, a0, 31
-; RV32I-NEXT: or t3, s5, t3
-; RV32I-NEXT: or t6, t6, s6
-; RV32I-NEXT: or s1, s8, s4
-; RV32I-NEXT: or s4, s10, s9
-; RV32I-NEXT: andi s5, a1, 28
-; RV32I-NEXT: xori a1, t5, 31
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, s2, s7
-; RV32I-NEXT: or a7, t4, s3
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or t1, t6, t3
-; RV32I-NEXT: or t2, s4, s1
-; RV32I-NEXT: add s0, s0, s5
-; RV32I-NEXT: sw a7, 24(sp)
-; RV32I-NEXT: sw t0, 28(sp)
-; RV32I-NEXT: sw t1, 32(sp)
-; RV32I-NEXT: sw t2, 36(sp)
-; RV32I-NEXT: sw a3, 8(sp)
-; RV32I-NEXT: sw a4, 12(sp)
-; RV32I-NEXT: sw a5, 16(sp)
-; RV32I-NEXT: sw a6, 20(sp)
-; RV32I-NEXT: lw a3, 0(s0)
-; RV32I-NEXT: lw a4, 4(s0)
-; RV32I-NEXT: lw a5, 8(s0)
-; RV32I-NEXT: lw a6, 12(s0)
-; RV32I-NEXT: lw a7, 16(s0)
-; RV32I-NEXT: lw t0, 20(s0)
-; RV32I-NEXT: lw t1, 24(s0)
-; RV32I-NEXT: lw t2, 28(s0)
-; RV32I-NEXT: srl t3, a4, a0
-; RV32I-NEXT: slli t4, a5, 1
+; RV32I-NEXT: andi a3, a0, 31
+; RV32I-NEXT: andi a4, a1, 28
+; RV32I-NEXT: xori a1, a3, 31
+; RV32I-NEXT: add a4, s3, a4
+; RV32I-NEXT: lw a3, 0(a4)
+; RV32I-NEXT: lw a5, 4(a4)
+; RV32I-NEXT: lw a6, 8(a4)
+; RV32I-NEXT: lw a7, 12(a4)
+; RV32I-NEXT: lw t0, 16(a4)
+; RV32I-NEXT: lw t1, 20(a4)
+; RV32I-NEXT: lw t2, 24(a4)
+; RV32I-NEXT: lw a4, 28(a4)
+; RV32I-NEXT: srl t3, a5, a0
+; RV32I-NEXT: slli t4, a6, 1
; RV32I-NEXT: srl a3, a3, a0
-; RV32I-NEXT: slli a4, a4, 1
-; RV32I-NEXT: srl t5, a6, a0
-; RV32I-NEXT: slli t6, a7, 1
-; RV32I-NEXT: srl a5, a5, a0
-; RV32I-NEXT: slli a6, a6, 1
-; RV32I-NEXT: srl s0, t0, a0
-; RV32I-NEXT: slli s1, t1, 1
-; RV32I-NEXT: srl a7, a7, a0
-; RV32I-NEXT: slli t0, t0, 1
-; RV32I-NEXT: srl t1, t1, a0
-; RV32I-NEXT: slli s2, t2, 1
+; RV32I-NEXT: slli a5, a5, 1
+; RV32I-NEXT: srl t5, a7, a0
+; RV32I-NEXT: slli t6, t0, 1
+; RV32I-NEXT: srl a6, a6, a0
+; RV32I-NEXT: slli a7, a7, 1
+; RV32I-NEXT: srl s0, t1, a0
+; RV32I-NEXT: slli s1, t2, 1
+; RV32I-NEXT: srl t0, t0, a0
+; RV32I-NEXT: slli t1, t1, 1
; RV32I-NEXT: srl t2, t2, a0
+; RV32I-NEXT: slli s2, a4, 1
+; RV32I-NEXT: srl s3, a4, a0
; RV32I-NEXT: sll a0, t4, a1
-; RV32I-NEXT: sll a4, a4, a1
-; RV32I-NEXT: sll t4, t6, a1
-; RV32I-NEXT: sll a6, a6, a1
-; RV32I-NEXT: sll t6, s1, a1
-; RV32I-NEXT: sll t0, t0, a1
-; RV32I-NEXT: sll s1, s2, a1
-; RV32I-NEXT: srli s2, t2, 24
-; RV32I-NEXT: srli s3, t2, 16
-; RV32I-NEXT: srli s4, t2, 8
+; RV32I-NEXT: sll a4, a5, a1
+; RV32I-NEXT: sll a5, t6, a1
+; RV32I-NEXT: sll a7, a7, a1
+; RV32I-NEXT: sll t4, s1, a1
+; RV32I-NEXT: sll t1, t1, a1
+; RV32I-NEXT: sll t6, s2, a1
+; RV32I-NEXT: srli s1, s3, 24
+; RV32I-NEXT: srli s2, s3, 16
+; RV32I-NEXT: srli s4, s3, 8
; RV32I-NEXT: or a0, t3, a0
; RV32I-NEXT: or a1, a3, a4
-; RV32I-NEXT: or a3, t5, t4
-; RV32I-NEXT: or a4, a5, a6
-; RV32I-NEXT: or a5, s0, t6
-; RV32I-NEXT: or a6, a7, t0
-; RV32I-NEXT: or a7, t1, s1
-; RV32I-NEXT: sb t2, 28(a2)
+; RV32I-NEXT: or a3, t5, a5
+; RV32I-NEXT: or a4, a6, a7
+; RV32I-NEXT: or a5, s0, t4
+; RV32I-NEXT: or a6, t0, t1
+; RV32I-NEXT: or a7, t2, t6
+; RV32I-NEXT: sb s3, 28(a2)
; RV32I-NEXT: sb s4, 29(a2)
-; RV32I-NEXT: sb s3, 30(a2)
-; RV32I-NEXT: sb s2, 31(a2)
+; RV32I-NEXT: sb s2, 30(a2)
+; RV32I-NEXT: sb s1, 31(a2)
; RV32I-NEXT: srli t0, a7, 24
; RV32I-NEXT: srli t1, a7, 16
; RV32I-NEXT: srli t2, a7, 8
@@ -1712,17 +1775,19 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: shl_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -144
-; RV64I-NEXT: sd s0, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 112(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 104(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 96(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 80(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -160
+; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill
; RV64I-NEXT: lbu a3, 0(a0)
; RV64I-NEXT: lbu a4, 1(a0)
; RV64I-NEXT: lbu a5, 2(a0)
@@ -1739,125 +1804,146 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: lbu s1, 13(a0)
; RV64I-NEXT: lbu s2, 14(a0)
; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
; RV64I-NEXT: lbu s4, 16(a0)
; RV64I-NEXT: lbu s5, 17(a0)
; RV64I-NEXT: lbu s6, 18(a0)
; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli s8, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
; RV64I-NEXT: slli t0, t0, 8
; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or a5, a4, a3
+; RV64I-NEXT: or a6, a6, s8
+; RV64I-NEXT: or a3, t0, a7
+; RV64I-NEXT: or a4, t2, t1
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
; RV64I-NEXT: slli t4, t4, 8
; RV64I-NEXT: slli t5, t5, 16
; RV64I-NEXT: slli t6, t6, 24
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
-; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: lbu t5, 20(a0)
-; RV64I-NEXT: lbu t6, 21(a0)
-; RV64I-NEXT: lbu s8, 22(a0)
-; RV64I-NEXT: lbu s9, 23(a0)
; RV64I-NEXT: slli s1, s1, 8
; RV64I-NEXT: slli s2, s2, 16
; RV64I-NEXT: slli s3, s3, 24
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: or t0, t6, t5
+; RV64I-NEXT: or t1, s1, s0
+; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: lbu t6, 24(a0)
+; RV64I-NEXT: lbu s0, 25(a0)
+; RV64I-NEXT: lbu s1, 26(a0)
+; RV64I-NEXT: lbu s2, 27(a0)
; RV64I-NEXT: slli s5, s5, 8
; RV64I-NEXT: slli s6, s6, 16
; RV64I-NEXT: slli s7, s7, 24
-; RV64I-NEXT: or t1, s1, s0
-; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: slli s9, s9, 8
; RV64I-NEXT: or t3, s5, s4
; RV64I-NEXT: or t4, s7, s6
-; RV64I-NEXT: lbu s0, 24(a0)
-; RV64I-NEXT: lbu s1, 25(a0)
-; RV64I-NEXT: lbu s2, 26(a0)
-; RV64I-NEXT: lbu s3, 27(a0)
-; RV64I-NEXT: slli t6, t6, 8
-; RV64I-NEXT: slli s8, s8, 16
-; RV64I-NEXT: slli s9, s9, 24
-; RV64I-NEXT: slli s1, s1, 8
-; RV64I-NEXT: or t5, t6, t5
-; RV64I-NEXT: or t6, s9, s8
-; RV64I-NEXT: or s0, s1, s0
-; RV64I-NEXT: lbu s1, 28(a0)
+; RV64I-NEXT: or t5, s9, s8
+; RV64I-NEXT: lbu s3, 28(a0)
; RV64I-NEXT: lbu s4, 29(a0)
; RV64I-NEXT: lbu s5, 30(a0)
; RV64I-NEXT: lbu s6, 31(a0)
-; RV64I-NEXT: lbu a0, 0(a1)
+; RV64I-NEXT: slli s10, s10, 16
+; RV64I-NEXT: slli s11, s11, 24
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: slli s1, s1, 16
+; RV64I-NEXT: slli s2, s2, 24
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or a0, s11, s10
+; RV64I-NEXT: or t6, s0, t6
+; RV64I-NEXT: or s0, s2, s1
+; RV64I-NEXT: or s1, s4, s3
+; RV64I-NEXT: lbu s2, 0(a1)
+; RV64I-NEXT: lbu s3, 1(a1)
+; RV64I-NEXT: lbu s4, 2(a1)
+; RV64I-NEXT: lbu s7, 3(a1)
+; RV64I-NEXT: slli s5, s5, 16
+; RV64I-NEXT: slli s6, s6, 24
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: slli s7, s7, 24
+; RV64I-NEXT: or s5, s6, s5
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, s7, s4
+; RV64I-NEXT: lbu s4, 5(a1)
+; RV64I-NEXT: lbu s6, 4(a1)
+; RV64I-NEXT: lbu s7, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or s4, s4, s6
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, s7
; RV64I-NEXT: sd zero, 0(sp)
; RV64I-NEXT: sd zero, 8(sp)
; RV64I-NEXT: sd zero, 16(sp)
; RV64I-NEXT: sd zero, 24(sp)
-; RV64I-NEXT: slli s2, s2, 16
-; RV64I-NEXT: slli s3, s3, 24
-; RV64I-NEXT: or a1, s3, s2
-; RV64I-NEXT: addi s2, sp, 32
-; RV64I-NEXT: slli s4, s4, 8
-; RV64I-NEXT: slli s5, s5, 16
-; RV64I-NEXT: slli s6, s6, 24
-; RV64I-NEXT: or s1, s4, s1
-; RV64I-NEXT: srli s3, a0, 3
-; RV64I-NEXT: or s4, s6, s5
-; RV64I-NEXT: andi s5, a0, 63
-; RV64I-NEXT: andi s3, s3, 24
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
-; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: or a1, a1, s0
-; RV64I-NEXT: or t1, s4, s1
-; RV64I-NEXT: sub t2, s2, s3
-; RV64I-NEXT: slli a4, a4, 32
-; RV64I-NEXT: slli a6, a6, 32
-; RV64I-NEXT: slli t0, t0, 32
-; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: addi a6, sp, 32
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a1, t1, a1
+; RV64I-NEXT: or a4, t0, a7
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: or t0, t4, t3
+; RV64I-NEXT: or a0, a0, t5
+; RV64I-NEXT: or t1, s0, t6
+; RV64I-NEXT: or t2, s5, s1
+; RV64I-NEXT: or t3, s3, s2
+; RV64I-NEXT: or a1, a1, s4
+; RV64I-NEXT: slli a3, a3, 32
+; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli t2, t2, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a3, a3, a5
+; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: or a5, t2, t1
+; RV64I-NEXT: or a1, a1, t3
; RV64I-NEXT: sd a3, 32(sp)
; RV64I-NEXT: sd a4, 40(sp)
-; RV64I-NEXT: sd a5, 48(sp)
-; RV64I-NEXT: sd a1, 56(sp)
-; RV64I-NEXT: ld a1, 0(t2)
-; RV64I-NEXT: ld a3, 8(t2)
-; RV64I-NEXT: ld a4, 16(t2)
-; RV64I-NEXT: ld a5, 24(t2)
-; RV64I-NEXT: xori a6, s5, 63
-; RV64I-NEXT: sll a7, a3, a0
-; RV64I-NEXT: srli t0, a1, 1
-; RV64I-NEXT: sll a5, a5, a0
-; RV64I-NEXT: srli t1, a4, 1
-; RV64I-NEXT: sll a4, a4, a0
-; RV64I-NEXT: srli a3, a3, 1
-; RV64I-NEXT: sll t2, a1, a0
-; RV64I-NEXT: srl a0, t0, a6
-; RV64I-NEXT: srl a1, t1, a6
-; RV64I-NEXT: srl a3, a3, a6
-; RV64I-NEXT: srli a6, t2, 56
-; RV64I-NEXT: srli t0, t2, 48
-; RV64I-NEXT: srli t1, t2, 40
-; RV64I-NEXT: srli t3, t2, 32
-; RV64I-NEXT: srli t4, t2, 24
-; RV64I-NEXT: srli t5, t2, 16
-; RV64I-NEXT: srli t6, t2, 8
-; RV64I-NEXT: or a0, a7, a0
-; RV64I-NEXT: or a1, a5, a1
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: sb t3, 4(a2)
-; RV64I-NEXT: sb t1, 5(a2)
-; RV64I-NEXT: sb t0, 6(a2)
-; RV64I-NEXT: sb a6, 7(a2)
-; RV64I-NEXT: sb t2, 0(a2)
-; RV64I-NEXT: sb t6, 1(a2)
-; RV64I-NEXT: sb t5, 2(a2)
-; RV64I-NEXT: sb t4, 3(a2)
+; RV64I-NEXT: sd a0, 48(sp)
+; RV64I-NEXT: sd a5, 56(sp)
+; RV64I-NEXT: srli a0, a1, 3
+; RV64I-NEXT: andi a3, a1, 63
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: sub a0, a6, a0
+; RV64I-NEXT: ld a4, 0(a0)
+; RV64I-NEXT: ld a5, 8(a0)
+; RV64I-NEXT: ld a6, 16(a0)
+; RV64I-NEXT: ld a0, 24(a0)
+; RV64I-NEXT: xori a3, a3, 63
+; RV64I-NEXT: sll a7, a5, a1
+; RV64I-NEXT: srli t0, a4, 1
+; RV64I-NEXT: sll t1, a0, a1
+; RV64I-NEXT: srli a0, a6, 1
+; RV64I-NEXT: sll a6, a6, a1
+; RV64I-NEXT: srli a5, a5, 1
+; RV64I-NEXT: sll a4, a4, a1
+; RV64I-NEXT: srl a1, t0, a3
+; RV64I-NEXT: srl t0, a0, a3
+; RV64I-NEXT: srl a3, a5, a3
+; RV64I-NEXT: srli a5, a4, 56
+; RV64I-NEXT: srli t2, a4, 48
+; RV64I-NEXT: srli t3, a4, 40
+; RV64I-NEXT: srli t4, a4, 32
+; RV64I-NEXT: srli t5, a4, 24
+; RV64I-NEXT: srli t6, a4, 16
+; RV64I-NEXT: srli s0, a4, 8
+; RV64I-NEXT: or a0, a7, a1
+; RV64I-NEXT: or a1, t1, t0
+; RV64I-NEXT: or a3, a6, a3
+; RV64I-NEXT: sb t4, 4(a2)
+; RV64I-NEXT: sb t3, 5(a2)
+; RV64I-NEXT: sb t2, 6(a2)
+; RV64I-NEXT: sb a5, 7(a2)
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb s0, 1(a2)
+; RV64I-NEXT: sb t6, 2(a2)
+; RV64I-NEXT: sb t5, 3(a2)
; RV64I-NEXT: srli a4, a3, 56
; RV64I-NEXT: srli a5, a3, 48
; RV64I-NEXT: srli a6, a3, 40
@@ -1903,17 +1989,19 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: sb a1, 9(a2)
; RV64I-NEXT: sb a5, 10(a2)
; RV64I-NEXT: sb a3, 11(a2)
-; RV64I-NEXT: ld s0, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 112(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 104(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 96(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 88(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 80(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 72(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 64(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 144
+; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 160
; RV64I-NEXT: ret
;
; RV32I-LABEL: shl_32bytes:
@@ -1938,55 +2026,67 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu a7, 3(a0)
; RV32I-NEXT: lbu a5, 4(a0)
; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t3, 6(a0)
-; RV32I-NEXT: lbu t6, 7(a0)
-; RV32I-NEXT: lbu s2, 8(a0)
-; RV32I-NEXT: lbu s3, 9(a0)
-; RV32I-NEXT: lbu s4, 10(a0)
-; RV32I-NEXT: lbu s5, 11(a0)
-; RV32I-NEXT: lbu s7, 12(a0)
-; RV32I-NEXT: lbu s8, 13(a0)
-; RV32I-NEXT: lbu s9, 14(a0)
-; RV32I-NEXT: lbu s10, 15(a0)
-; RV32I-NEXT: lbu s11, 16(a0)
-; RV32I-NEXT: lbu ra, 17(a0)
-; RV32I-NEXT: lbu t4, 18(a0)
-; RV32I-NEXT: lbu s0, 19(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s2, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s5, 15(a0)
+; RV32I-NEXT: lbu s6, 16(a0)
+; RV32I-NEXT: lbu s7, 17(a0)
+; RV32I-NEXT: lbu s8, 18(a0)
+; RV32I-NEXT: lbu s9, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
; RV32I-NEXT: or a4, a7, a6
-; RV32I-NEXT: lbu t1, 20(a0)
-; RV32I-NEXT: lbu t2, 21(a0)
-; RV32I-NEXT: lbu t5, 22(a0)
-; RV32I-NEXT: lbu s1, 23(a0)
+; RV32I-NEXT: lbu s10, 20(a0)
+; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: lbu ra, 22(a0)
+; RV32I-NEXT: lbu a3, 23(a0)
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or a5, t0, a5
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: lbu s1, 24(a0)
+; RV32I-NEXT: lbu s3, 25(a0)
+; RV32I-NEXT: lbu t4, 26(a0)
+; RV32I-NEXT: lbu t5, 27(a0)
+; RV32I-NEXT: slli s2, s2, 8
; RV32I-NEXT: slli s4, s4, 16
; RV32I-NEXT: slli s5, s5, 24
-; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: or a6, t6, t3
-; RV32I-NEXT: or a7, s3, s2
-; RV32I-NEXT: or t0, s5, s4
-; RV32I-NEXT: lbu t3, 24(a0)
-; RV32I-NEXT: lbu s5, 25(a0)
-; RV32I-NEXT: lbu s6, 26(a0)
-; RV32I-NEXT: lbu t6, 27(a0)
-; RV32I-NEXT: slli s8, s8, 8
-; RV32I-NEXT: slli s9, s9, 16
-; RV32I-NEXT: slli s10, s10, 24
-; RV32I-NEXT: slli ra, ra, 8
-; RV32I-NEXT: or s7, s8, s7
-; RV32I-NEXT: or s2, s10, s9
-; RV32I-NEXT: or s3, ra, s11
-; RV32I-NEXT: lbu s4, 28(a0)
-; RV32I-NEXT: lbu s8, 29(a0)
-; RV32I-NEXT: lbu s9, 30(a0)
-; RV32I-NEXT: lbu s10, 31(a0)
-; RV32I-NEXT: lbu a0, 0(a1)
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: or t1, s2, s0
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: or t3, s7, s6
+; RV32I-NEXT: lbu t6, 28(a0)
+; RV32I-NEXT: lbu s4, 29(a0)
+; RV32I-NEXT: lbu s5, 30(a0)
+; RV32I-NEXT: lbu s6, 31(a0)
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: slli ra, ra, 16
+; RV32I-NEXT: slli a3, a3, 24
+; RV32I-NEXT: or a0, s9, s8
+; RV32I-NEXT: or s0, s11, s10
+; RV32I-NEXT: or s2, a3, ra
+; RV32I-NEXT: lbu a3, 0(a1)
+; RV32I-NEXT: lbu s7, 1(a1)
+; RV32I-NEXT: lbu s8, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 28(sp)
; RV32I-NEXT: sw zero, 32(sp)
@@ -1995,89 +2095,88 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sw zero, 12(sp)
; RV32I-NEXT: sw zero, 16(sp)
; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or s1, s3, s1
+; RV32I-NEXT: addi s3, sp, 40
; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: or t4, s0, t4
-; RV32I-NEXT: addi s0, sp, 40
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: slli t5, t5, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: slli s5, s5, 8
-; RV32I-NEXT: slli s6, s6, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s8, s8, 8
-; RV32I-NEXT: slli s9, s9, 16
-; RV32I-NEXT: slli s10, s10, 24
-; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: slli t5, t5, 24
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 16
+; RV32I-NEXT: slli s6, s6, 24
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s4, t6
+; RV32I-NEXT: or t6, s6, s5
+; RV32I-NEXT: or a3, s7, a3
+; RV32I-NEXT: or a1, a1, s8
+; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a4, a4, s4
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a7, t2, t1
+; RV32I-NEXT: or t0, a0, t3
+; RV32I-NEXT: or t1, s2, s0
+; RV32I-NEXT: or t2, t4, s1
+; RV32I-NEXT: or t3, t6, t5
+; RV32I-NEXT: or a0, a1, a3
+; RV32I-NEXT: sw t0, 56(sp)
+; RV32I-NEXT: sw t1, 60(sp)
+; RV32I-NEXT: sw t2, 64(sp)
+; RV32I-NEXT: sw t3, 68(sp)
+; RV32I-NEXT: sw a4, 40(sp)
+; RV32I-NEXT: sw a5, 44(sp)
+; RV32I-NEXT: sw a6, 48(sp)
+; RV32I-NEXT: sw a7, 52(sp)
; RV32I-NEXT: srli a1, a0, 3
-; RV32I-NEXT: or t2, s1, t5
-; RV32I-NEXT: andi t5, a0, 31
-; RV32I-NEXT: or t3, s5, t3
-; RV32I-NEXT: or t6, t6, s6
-; RV32I-NEXT: or s1, s8, s4
-; RV32I-NEXT: or s4, s10, s9
-; RV32I-NEXT: andi s5, a1, 28
-; RV32I-NEXT: xori a1, t5, 31
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, s2, s7
-; RV32I-NEXT: or a7, t4, s3
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or t1, t6, t3
-; RV32I-NEXT: or t2, s4, s1
-; RV32I-NEXT: sub t3, s0, s5
-; RV32I-NEXT: sw a7, 56(sp)
-; RV32I-NEXT: sw t0, 60(sp)
-; RV32I-NEXT: sw t1, 64(sp)
-; RV32I-NEXT: sw t2, 68(sp)
-; RV32I-NEXT: sw a3, 40(sp)
-; RV32I-NEXT: sw a4, 44(sp)
-; RV32I-NEXT: sw a5, 48(sp)
-; RV32I-NEXT: sw a6, 52(sp)
-; RV32I-NEXT: lw a3, 0(t3)
-; RV32I-NEXT: lw a4, 4(t3)
-; RV32I-NEXT: lw a5, 8(t3)
-; RV32I-NEXT: lw a6, 12(t3)
-; RV32I-NEXT: lw a7, 16(t3)
-; RV32I-NEXT: lw t0, 20(t3)
-; RV32I-NEXT: lw t1, 24(t3)
-; RV32I-NEXT: lw t2, 28(t3)
-; RV32I-NEXT: sll t3, a4, a0
-; RV32I-NEXT: srli t4, a3, 1
-; RV32I-NEXT: sll t5, a6, a0
-; RV32I-NEXT: srli t6, a5, 1
-; RV32I-NEXT: sll a5, a5, a0
-; RV32I-NEXT: srli a4, a4, 1
-; RV32I-NEXT: sll s0, t0, a0
-; RV32I-NEXT: srli s1, a7, 1
-; RV32I-NEXT: sll a7, a7, a0
-; RV32I-NEXT: srli a6, a6, 1
+; RV32I-NEXT: andi a3, a0, 31
+; RV32I-NEXT: andi a4, a1, 28
+; RV32I-NEXT: xori a1, a3, 31
+; RV32I-NEXT: sub a3, s3, a4
+; RV32I-NEXT: lw a4, 0(a3)
+; RV32I-NEXT: lw a5, 4(a3)
+; RV32I-NEXT: lw a6, 8(a3)
+; RV32I-NEXT: lw a7, 12(a3)
+; RV32I-NEXT: lw t0, 16(a3)
+; RV32I-NEXT: lw t1, 20(a3)
+; RV32I-NEXT: lw t2, 24(a3)
+; RV32I-NEXT: lw a3, 28(a3)
+; RV32I-NEXT: sll t3, a5, a0
+; RV32I-NEXT: srli t4, a4, 1
+; RV32I-NEXT: sll t5, a7, a0
+; RV32I-NEXT: srli t6, a6, 1
+; RV32I-NEXT: sll a6, a6, a0
+; RV32I-NEXT: srli a5, a5, 1
+; RV32I-NEXT: sll s0, t1, a0
+; RV32I-NEXT: srli s1, t0, 1
+; RV32I-NEXT: sll t0, t0, a0
+; RV32I-NEXT: srli a7, a7, 1
+; RV32I-NEXT: sll s2, a3, a0
+; RV32I-NEXT: srli a3, t2, 1
; RV32I-NEXT: sll t2, t2, a0
-; RV32I-NEXT: srli s2, t1, 1
-; RV32I-NEXT: sll t1, t1, a0
-; RV32I-NEXT: srli t0, t0, 1
-; RV32I-NEXT: sll s3, a3, a0
+; RV32I-NEXT: srli t1, t1, 1
+; RV32I-NEXT: sll s3, a4, a0
; RV32I-NEXT: srl a0, t4, a1
-; RV32I-NEXT: srl a3, t6, a1
-; RV32I-NEXT: srl a4, a4, a1
+; RV32I-NEXT: srl a4, t6, a1
+; RV32I-NEXT: srl a5, a5, a1
; RV32I-NEXT: srl t4, s1, a1
-; RV32I-NEXT: srl a6, a6, a1
-; RV32I-NEXT: srl t6, s2, a1
-; RV32I-NEXT: srl t0, t0, a1
+; RV32I-NEXT: srl a7, a7, a1
+; RV32I-NEXT: srl t6, a3, a1
+; RV32I-NEXT: srl t1, t1, a1
; RV32I-NEXT: srli s1, s3, 24
-; RV32I-NEXT: srli s2, s3, 16
-; RV32I-NEXT: srli s4, s3, 8
+; RV32I-NEXT: srli s4, s3, 16
+; RV32I-NEXT: srli s5, s3, 8
; RV32I-NEXT: or a0, t3, a0
-; RV32I-NEXT: or a1, t5, a3
-; RV32I-NEXT: or a3, a5, a4
+; RV32I-NEXT: or a1, t5, a4
+; RV32I-NEXT: or a3, a6, a5
; RV32I-NEXT: or a4, s0, t4
-; RV32I-NEXT: or a5, a7, a6
-; RV32I-NEXT: or a6, t2, t6
-; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, s2, t6
+; RV32I-NEXT: or a7, t2, t1
; RV32I-NEXT: sb s3, 0(a2)
-; RV32I-NEXT: sb s4, 1(a2)
-; RV32I-NEXT: sb s2, 2(a2)
+; RV32I-NEXT: sb s5, 1(a2)
+; RV32I-NEXT: sb s4, 2(a2)
; RV32I-NEXT: sb s1, 3(a2)
; RV32I-NEXT: srli t0, a7, 24
; RV32I-NEXT: srli t1, a7, 16
@@ -2152,17 +2251,19 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: ashr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -144
-; RV64I-NEXT: sd s0, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 112(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 104(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 96(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 80(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -160
+; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill
; RV64I-NEXT: lbu a3, 0(a0)
; RV64I-NEXT: lbu a4, 1(a0)
; RV64I-NEXT: lbu a5, 2(a0)
@@ -2179,123 +2280,144 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: lbu s1, 13(a0)
; RV64I-NEXT: lbu s2, 14(a0)
; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
; RV64I-NEXT: lbu s4, 16(a0)
; RV64I-NEXT: lbu s5, 17(a0)
; RV64I-NEXT: lbu s6, 18(a0)
; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
; RV64I-NEXT: slli t0, t0, 8
; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a6, t2, t1
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
; RV64I-NEXT: slli t4, t4, 8
; RV64I-NEXT: slli t5, t5, 16
; RV64I-NEXT: slli t6, t6, 24
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
-; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: lbu t5, 20(a0)
-; RV64I-NEXT: lbu t6, 21(a0)
-; RV64I-NEXT: lbu s8, 22(a0)
-; RV64I-NEXT: lbu s9, 23(a0)
; RV64I-NEXT: slli s1, s1, 8
; RV64I-NEXT: slli s2, s2, 16
; RV64I-NEXT: slli s3, s3, 24
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: or t0, t6, t5
+; RV64I-NEXT: or t1, s1, s0
+; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: lbu t6, 24(a0)
+; RV64I-NEXT: lbu s0, 25(a0)
+; RV64I-NEXT: lbu s1, 26(a0)
+; RV64I-NEXT: lbu s2, 27(a0)
; RV64I-NEXT: slli s5, s5, 8
; RV64I-NEXT: slli s6, s6, 16
; RV64I-NEXT: slli s7, s7, 24
-; RV64I-NEXT: or t1, s1, s0
-; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: slli s9, s9, 8
; RV64I-NEXT: or t3, s5, s4
; RV64I-NEXT: or t4, s7, s6
-; RV64I-NEXT: lbu s0, 24(a0)
-; RV64I-NEXT: lbu s1, 25(a0)
-; RV64I-NEXT: lbu s2, 26(a0)
-; RV64I-NEXT: lbu s3, 27(a0)
-; RV64I-NEXT: slli t6, t6, 8
-; RV64I-NEXT: slli s8, s8, 16
-; RV64I-NEXT: slli s9, s9, 24
-; RV64I-NEXT: slli s1, s1, 8
-; RV64I-NEXT: or t5, t6, t5
-; RV64I-NEXT: or t6, s9, s8
-; RV64I-NEXT: or s0, s1, s0
-; RV64I-NEXT: lbu s1, 28(a0)
+; RV64I-NEXT: or t5, s9, s8
+; RV64I-NEXT: lbu s3, 28(a0)
; RV64I-NEXT: lbu s4, 29(a0)
; RV64I-NEXT: lbu s5, 30(a0)
; RV64I-NEXT: lbu s6, 31(a0)
-; RV64I-NEXT: lbu a0, 0(a1)
-; RV64I-NEXT: slli s2, s2, 16
-; RV64I-NEXT: slli s3, s3, 24
-; RV64I-NEXT: or a1, s3, s2
-; RV64I-NEXT: mv s2, sp
+; RV64I-NEXT: slli s10, s10, 16
+; RV64I-NEXT: slli s11, s11, 24
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: slli s1, s1, 16
+; RV64I-NEXT: slli s2, s2, 24
; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or a0, s11, s10
+; RV64I-NEXT: or t6, s0, t6
+; RV64I-NEXT: or s0, s2, s1
+; RV64I-NEXT: or s1, s4, s3
+; RV64I-NEXT: lbu s2, 0(a1)
+; RV64I-NEXT: lbu s3, 1(a1)
+; RV64I-NEXT: lbu s4, 2(a1)
+; RV64I-NEXT: lbu s7, 3(a1)
; RV64I-NEXT: slli s5, s5, 16
; RV64I-NEXT: slli s6, s6, 24
-; RV64I-NEXT: or s1, s4, s1
-; RV64I-NEXT: srli s3, a0, 3
-; RV64I-NEXT: or s4, s6, s5
-; RV64I-NEXT: andi s5, a0, 63
-; RV64I-NEXT: andi s3, s3, 24
-; RV64I-NEXT: xori s5, s5, 63
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: slli s7, s7, 24
+; RV64I-NEXT: or s5, s6, s5
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, s7, s4
+; RV64I-NEXT: lbu s4, 5(a1)
+; RV64I-NEXT: lbu s6, 4(a1)
+; RV64I-NEXT: lbu s7, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or s4, s4, s6
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, s7
+; RV64I-NEXT: mv s6, sp
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: or a4, a6, a5
; RV64I-NEXT: or a5, t0, a7
; RV64I-NEXT: or a6, t2, t1
; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: or a1, a1, s0
-; RV64I-NEXT: or t1, s4, s1
-; RV64I-NEXT: add s2, s2, s3
+; RV64I-NEXT: or a0, a0, t5
+; RV64I-NEXT: or t0, s0, t6
+; RV64I-NEXT: or t1, s5, s1
+; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: or a1, a1, s4
; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: slli a6, a6, 32
-; RV64I-NEXT: slli t0, t0, 32
-; RV64I-NEXT: slli t2, t1, 32
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli t3, t1, 32
+; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: sraiw t1, t1, 31
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a1, t2, a1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: or a5, t3, t0
+; RV64I-NEXT: or a1, a1, t2
; RV64I-NEXT: sd t1, 32(sp)
; RV64I-NEXT: sd t1, 40(sp)
; RV64I-NEXT: sd t1, 48(sp)
; RV64I-NEXT: sd t1, 56(sp)
; RV64I-NEXT: sd a3, 0(sp)
; RV64I-NEXT: sd a4, 8(sp)
-; RV64I-NEXT: sd a5, 16(sp)
-; RV64I-NEXT: sd a1, 24(sp)
-; RV64I-NEXT: ld a1, 8(s2)
-; RV64I-NEXT: ld a3, 16(s2)
-; RV64I-NEXT: ld a4, 0(s2)
-; RV64I-NEXT: ld a5, 24(s2)
-; RV64I-NEXT: srl a6, a1, a0
-; RV64I-NEXT: slli a7, a3, 1
-; RV64I-NEXT: srl a4, a4, a0
-; RV64I-NEXT: slli a1, a1, 1
-; RV64I-NEXT: srl a3, a3, a0
+; RV64I-NEXT: sd a0, 16(sp)
+; RV64I-NEXT: sd a5, 24(sp)
+; RV64I-NEXT: srli a0, a1, 3
+; RV64I-NEXT: andi a3, a1, 63
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: xori a3, a3, 63
+; RV64I-NEXT: add a0, s6, a0
+; RV64I-NEXT: ld a4, 8(a0)
+; RV64I-NEXT: ld a5, 16(a0)
+; RV64I-NEXT: ld a6, 0(a0)
+; RV64I-NEXT: ld a0, 24(a0)
+; RV64I-NEXT: srl a7, a4, a1
; RV64I-NEXT: slli t0, a5, 1
-; RV64I-NEXT: sra a5, a5, a0
-; RV64I-NEXT: sll a0, a7, s5
-; RV64I-NEXT: sll a1, a1, s5
-; RV64I-NEXT: sll a7, t0, s5
-; RV64I-NEXT: srli t0, a5, 56
-; RV64I-NEXT: srli t1, a5, 48
-; RV64I-NEXT: srli t2, a5, 40
-; RV64I-NEXT: srli t3, a5, 32
-; RV64I-NEXT: srli t4, a5, 24
-; RV64I-NEXT: srli t5, a5, 16
-; RV64I-NEXT: srli t6, a5, 8
-; RV64I-NEXT: or a0, a6, a0
-; RV64I-NEXT: or a1, a4, a1
-; RV64I-NEXT: or a3, a3, a7
+; RV64I-NEXT: srl a6, a6, a1
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: srl a5, a5, a1
+; RV64I-NEXT: slli t1, a0, 1
+; RV64I-NEXT: sra t2, a0, a1
+; RV64I-NEXT: sll a0, t0, a3
+; RV64I-NEXT: sll a1, a4, a3
+; RV64I-NEXT: sll a3, t1, a3
+; RV64I-NEXT: srli a4, t2, 56
+; RV64I-NEXT: srli t0, t2, 48
+; RV64I-NEXT: srli t1, t2, 40
+; RV64I-NEXT: srli t3, t2, 32
+; RV64I-NEXT: srli t4, t2, 24
+; RV64I-NEXT: srli t5, t2, 16
+; RV64I-NEXT: srli t6, t2, 8
+; RV64I-NEXT: or a0, a7, a0
+; RV64I-NEXT: or a1, a6, a1
+; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: sb t3, 28(a2)
-; RV64I-NEXT: sb t2, 29(a2)
-; RV64I-NEXT: sb t1, 30(a2)
-; RV64I-NEXT: sb t0, 31(a2)
-; RV64I-NEXT: sb a5, 24(a2)
+; RV64I-NEXT: sb t1, 29(a2)
+; RV64I-NEXT: sb t0, 30(a2)
+; RV64I-NEXT: sb a4, 31(a2)
+; RV64I-NEXT: sb t2, 24(a2)
; RV64I-NEXT: sb t6, 25(a2)
; RV64I-NEXT: sb t5, 26(a2)
; RV64I-NEXT: sb t4, 27(a2)
@@ -2316,45 +2438,47 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: srli s3, a0, 56
; RV64I-NEXT: srli s4, a0, 48
; RV64I-NEXT: srli s5, a0, 40
+; RV64I-NEXT: srli s6, a0, 32
; RV64I-NEXT: sb a7, 20(a2)
; RV64I-NEXT: sb a6, 21(a2)
; RV64I-NEXT: sb a5, 22(a2)
; RV64I-NEXT: sb a4, 23(a2)
-; RV64I-NEXT: srli a4, a0, 32
+; RV64I-NEXT: srli a4, a0, 24
; RV64I-NEXT: sb a3, 16(a2)
; RV64I-NEXT: sb t2, 17(a2)
; RV64I-NEXT: sb t1, 18(a2)
; RV64I-NEXT: sb t0, 19(a2)
-; RV64I-NEXT: srli a3, a0, 24
+; RV64I-NEXT: srli a3, a0, 16
; RV64I-NEXT: sb t6, 4(a2)
; RV64I-NEXT: sb t5, 5(a2)
; RV64I-NEXT: sb t4, 6(a2)
; RV64I-NEXT: sb t3, 7(a2)
-; RV64I-NEXT: srli a5, a0, 16
+; RV64I-NEXT: srli a5, a0, 8
; RV64I-NEXT: sb a1, 0(a2)
; RV64I-NEXT: sb s2, 1(a2)
; RV64I-NEXT: sb s1, 2(a2)
; RV64I-NEXT: sb s0, 3(a2)
-; RV64I-NEXT: srli a1, a0, 8
-; RV64I-NEXT: sb a4, 12(a2)
+; RV64I-NEXT: sb s6, 12(a2)
; RV64I-NEXT: sb s5, 13(a2)
; RV64I-NEXT: sb s4, 14(a2)
; RV64I-NEXT: sb s3, 15(a2)
; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: sb a1, 9(a2)
-; RV64I-NEXT: sb a5, 10(a2)
-; RV64I-NEXT: sb a3, 11(a2)
-; RV64I-NEXT: ld s0, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 112(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 104(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 96(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 88(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 80(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 72(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 64(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 144
+; RV64I-NEXT: sb a5, 9(a2)
+; RV64I-NEXT: sb a3, 10(a2)
+; RV64I-NEXT: sb a4, 11(a2)
+; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 160
; RV64I-NEXT: ret
;
; RV32I-LABEL: ashr_32bytes:
@@ -2379,148 +2503,159 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu a7, 3(a0)
; RV32I-NEXT: lbu a5, 4(a0)
; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t3, 6(a0)
-; RV32I-NEXT: lbu t4, 7(a0)
-; RV32I-NEXT: lbu t6, 8(a0)
-; RV32I-NEXT: lbu s0, 9(a0)
-; RV32I-NEXT: lbu s4, 10(a0)
-; RV32I-NEXT: lbu s5, 11(a0)
-; RV32I-NEXT: lbu s6, 12(a0)
-; RV32I-NEXT: lbu s7, 13(a0)
-; RV32I-NEXT: lbu s8, 14(a0)
-; RV32I-NEXT: lbu s9, 15(a0)
-; RV32I-NEXT: lbu s10, 16(a0)
-; RV32I-NEXT: lbu s11, 17(a0)
-; RV32I-NEXT: lbu s2, 18(a0)
-; RV32I-NEXT: lbu s3, 19(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s1, 13(a0)
+; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: lbu s3, 15(a0)
+; RV32I-NEXT: lbu s4, 16(a0)
+; RV32I-NEXT: lbu s5, 17(a0)
+; RV32I-NEXT: lbu s6, 18(a0)
+; RV32I-NEXT: lbu s7, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
; RV32I-NEXT: or a4, a7, a6
-; RV32I-NEXT: lbu t1, 20(a0)
-; RV32I-NEXT: lbu t2, 21(a0)
-; RV32I-NEXT: lbu t5, 22(a0)
-; RV32I-NEXT: lbu s1, 23(a0)
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu s10, 22(a0)
+; RV32I-NEXT: lbu s11, 23(a0)
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s5, s5, 24
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t6, t6, 24
; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: or a6, t4, t3
-; RV32I-NEXT: or a7, s0, t6
-; RV32I-NEXT: or t0, s5, s4
-; RV32I-NEXT: lbu t3, 24(a0)
-; RV32I-NEXT: lbu s4, 25(a0)
-; RV32I-NEXT: lbu s5, 26(a0)
-; RV32I-NEXT: lbu ra, 27(a0)
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: or t4, s7, s6
-; RV32I-NEXT: or t6, s9, s8
-; RV32I-NEXT: or s0, s11, s10
-; RV32I-NEXT: lbu s6, 28(a0)
-; RV32I-NEXT: lbu s7, 29(a0)
-; RV32I-NEXT: lbu s8, 30(a0)
-; RV32I-NEXT: lbu s9, 31(a0)
-; RV32I-NEXT: lbu a0, 0(a1)
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: lbu ra, 24(a0)
+; RV32I-NEXT: lbu a3, 25(a0)
+; RV32I-NEXT: lbu t4, 26(a0)
+; RV32I-NEXT: lbu t5, 27(a0)
+; RV32I-NEXT: slli s1, s1, 8
; RV32I-NEXT: slli s2, s2, 16
; RV32I-NEXT: slli s3, s3, 24
-; RV32I-NEXT: or s2, s3, s2
-; RV32I-NEXT: addi s3, sp, 8
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: slli t5, t5, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: slli s4, s4, 8
-; RV32I-NEXT: slli s5, s5, 16
-; RV32I-NEXT: slli ra, ra, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: or t1, t2, t1
-; RV32I-NEXT: srli a1, a0, 3
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: or t1, s1, s0
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: or t3, s5, s4
+; RV32I-NEXT: lbu t6, 28(a0)
+; RV32I-NEXT: lbu s0, 29(a0)
+; RV32I-NEXT: lbu s1, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli s6, s6, 16
+; RV32I-NEXT: slli s7, s7, 24
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: or s2, s7, s6
+; RV32I-NEXT: or s3, s9, s8
+; RV32I-NEXT: or s4, s11, s10
+; RV32I-NEXT: lbu s5, 0(a1)
+; RV32I-NEXT: lbu s6, 1(a1)
+; RV32I-NEXT: lbu s7, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, ra
+; RV32I-NEXT: addi s8, sp, 8
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t5, t5, 24
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: slli s6, s6, 8
+; RV32I-NEXT: slli s7, s7, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s0, t6
+; RV32I-NEXT: or s1, a0, s1
+; RV32I-NEXT: or t6, s6, s5
+; RV32I-NEXT: or a1, a1, s7
+; RV32I-NEXT: srai s0, a0, 31
+; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a4, a4, a0
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a7, t2, t1
+; RV32I-NEXT: or t0, s2, t3
+; RV32I-NEXT: or t1, s4, s3
+; RV32I-NEXT: or a3, t4, a3
; RV32I-NEXT: or t2, s1, t5
-; RV32I-NEXT: andi t5, a0, 31
-; RV32I-NEXT: or t3, s4, t3
-; RV32I-NEXT: or s1, ra, s5
-; RV32I-NEXT: or s4, s7, s6
-; RV32I-NEXT: or s5, s9, s8
-; RV32I-NEXT: srai s6, s9, 31
-; RV32I-NEXT: andi s7, a1, 28
-; RV32I-NEXT: xori a1, t5, 31
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t6, t4
-; RV32I-NEXT: or a7, s2, s0
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or t1, s1, t3
-; RV32I-NEXT: or t2, s5, s4
-; RV32I-NEXT: sw s6, 56(sp)
-; RV32I-NEXT: sw s6, 60(sp)
-; RV32I-NEXT: sw s6, 64(sp)
-; RV32I-NEXT: sw s6, 68(sp)
-; RV32I-NEXT: sw s6, 40(sp)
-; RV32I-NEXT: sw s6, 44(sp)
-; RV32I-NEXT: sw s6, 48(sp)
-; RV32I-NEXT: sw s6, 52(sp)
-; RV32I-NEXT: add s3, s3, s7
-; RV32I-NEXT: sw a7, 24(sp)
-; RV32I-NEXT: sw t0, 28(sp)
-; RV32I-NEXT: sw t1, 32(sp)
+; RV32I-NEXT: or a0, a1, t6
+; RV32I-NEXT: sw s0, 56(sp)
+; RV32I-NEXT: sw s0, 60(sp)
+; RV32I-NEXT: sw s0, 64(sp)
+; RV32I-NEXT: sw s0, 68(sp)
+; RV32I-NEXT: sw s0, 40(sp)
+; RV32I-NEXT: sw s0, 44(sp)
+; RV32I-NEXT: sw s0, 48(sp)
+; RV32I-NEXT: sw s0, 52(sp)
+; RV32I-NEXT: sw t0, 24(sp)
+; RV32I-NEXT: sw t1, 28(sp)
+; RV32I-NEXT: sw a3, 32(sp)
; RV32I-NEXT: sw t2, 36(sp)
-; RV32I-NEXT: sw a3, 8(sp)
-; RV32I-NEXT: sw a4, 12(sp)
-; RV32I-NEXT: sw a5, 16(sp)
-; RV32I-NEXT: sw a6, 20(sp)
-; RV32I-NEXT: lw a3, 0(s3)
-; RV32I-NEXT: lw a4, 4(s3)
-; RV32I-NEXT: lw a5, 8(s3)
-; RV32I-NEXT: lw a6, 12(s3)
-; RV32I-NEXT: lw a7, 16(s3)
-; RV32I-NEXT: lw t0, 20(s3)
-; RV32I-NEXT: lw t1, 24(s3)
-; RV32I-NEXT: lw t2, 28(s3)
-; RV32I-NEXT: srl t3, a4, a0
-; RV32I-NEXT: slli t4, a5, 1
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a5, 12(sp)
+; RV32I-NEXT: sw a6, 16(sp)
+; RV32I-NEXT: sw a7, 20(sp)
+; RV32I-NEXT: srli a1, a0, 3
+; RV32I-NEXT: andi a3, a0, 31
+; RV32I-NEXT: andi a4, a1, 28
+; RV32I-NEXT: xori a1, a3, 31
+; RV32I-NEXT: add a4, s8, a4
+; RV32I-NEXT: lw a3, 0(a4)
+; RV32I-NEXT: lw a5, 4(a4)
+; RV32I-NEXT: lw a6, 8(a4)
+; RV32I-NEXT: lw a7, 12(a4)
+; RV32I-NEXT: lw t0, 16(a4)
+; RV32I-NEXT: lw t1, 20(a4)
+; RV32I-NEXT: lw t2, 24(a4)
+; RV32I-NEXT: lw a4, 28(a4)
+; RV32I-NEXT: srl t3, a5, a0
+; RV32I-NEXT: slli t4, a6, 1
; RV32I-NEXT: srl a3, a3, a0
-; RV32I-NEXT: slli a4, a4, 1
-; RV32I-NEXT: srl t5, a6, a0
-; RV32I-NEXT: slli t6, a7, 1
-; RV32I-NEXT: srl a5, a5, a0
-; RV32I-NEXT: slli a6, a6, 1
-; RV32I-NEXT: srl s0, t0, a0
-; RV32I-NEXT: slli s1, t1, 1
-; RV32I-NEXT: srl a7, a7, a0
-; RV32I-NEXT: slli t0, t0, 1
-; RV32I-NEXT: srl t1, t1, a0
-; RV32I-NEXT: slli s2, t2, 1
-; RV32I-NEXT: sra t2, t2, a0
+; RV32I-NEXT: slli a5, a5, 1
+; RV32I-NEXT: srl t5, a7, a0
+; RV32I-NEXT: slli t6, t0, 1
+; RV32I-NEXT: srl a6, a6, a0
+; RV32I-NEXT: slli a7, a7, 1
+; RV32I-NEXT: srl s0, t1, a0
+; RV32I-NEXT: slli s1, t2, 1
+; RV32I-NEXT: srl t0, t0, a0
+; RV32I-NEXT: slli t1, t1, 1
+; RV32I-NEXT: srl t2, t2, a0
+; RV32I-NEXT: slli s2, a4, 1
+; RV32I-NEXT: sra s3, a4, a0
; RV32I-NEXT: sll a0, t4, a1
-; RV32I-NEXT: sll a4, a4, a1
-; RV32I-NEXT: sll t4, t6, a1
-; RV32I-NEXT: sll a6, a6, a1
-; RV32I-NEXT: sll t6, s1, a1
-; RV32I-NEXT: sll t0, t0, a1
-; RV32I-NEXT: sll s1, s2, a1
-; RV32I-NEXT: srli s2, t2, 24
-; RV32I-NEXT: srli s3, t2, 16
-; RV32I-NEXT: srli s4, t2, 8
+; RV32I-NEXT: sll a4, a5, a1
+; RV32I-NEXT: sll a5, t6, a1
+; RV32I-NEXT: sll a7, a7, a1
+; RV32I-NEXT: sll t4, s1, a1
+; RV32I-NEXT: sll t1, t1, a1
+; RV32I-NEXT: sll t6, s2, a1
+; RV32I-NEXT: srli s1, s3, 24
+; RV32I-NEXT: srli s2, s3, 16
+; RV32I-NEXT: srli s4, s3, 8
; RV32I-NEXT: or a0, t3, a0
; RV32I-NEXT: or a1, a3, a4
-; RV32I-NEXT: or a3, t5, t4
-; RV32I-NEXT: or a4, a5, a6
-; RV32I-NEXT: or a5, s0, t6
-; RV32I-NEXT: or a6, a7, t0
-; RV32I-NEXT: or a7, t1, s1
-; RV32I-NEXT: sb t2, 28(a2)
+; RV32I-NEXT: or a3, t5, a5
+; RV32I-NEXT: or a4, a6, a7
+; RV32I-NEXT: or a5, s0, t4
+; RV32I-NEXT: or a6, t0, t1
+; RV32I-NEXT: or a7, t2, t6
+; RV32I-NEXT: sb s3, 28(a2)
; RV32I-NEXT: sb s4, 29(a2)
-; RV32I-NEXT: sb s3, 30(a2)
-; RV32I-NEXT: sb s2, 31(a2)
+; RV32I-NEXT: sb s2, 30(a2)
+; RV32I-NEXT: sb s1, 31(a2)
; RV32I-NEXT: srli t0, a7, 24
; RV32I-NEXT: srli t1, a7, 16
; RV32I-NEXT: srli t2, a7, 8
diff --git a/llvm/test/CodeGen/RISCV/xtheadfmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadfmemidx.ll
index cdaae23..5724c4f 100644
--- a/llvm/test/CodeGen/RISCV/xtheadfmemidx.ll
+++ b/llvm/test/CodeGen/RISCV/xtheadfmemidx.ll
@@ -1,33 +1,27 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d -mattr=+xtheadfmemidx -mattr=+m -verify-machineinstrs < %s \
-; RUN: | FileCheck %s -check-prefix=RV32XTHEADMEMIDX
-; RUN: llc -mtriple=riscv64 -mattr=+d -mattr=+xtheadfmemidx -verify-machineinstrs < %s \
-; RUN: | FileCheck %s -check-prefix=RV64XTHEADFMEMIDX
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+d,+xtheadfmemidx \
+; RUN: -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV32XTHEADFMEMIDX
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+d,+xtheadfmemidx \
+; RUN: -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV64XTHEADFMEMIDX
-define float @flrw(ptr %a, i64 %b) {
-; RV32XTHEADMEMIDX-LABEL: flrw:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.flrw fa5, a0, a1, 2
-; RV32XTHEADMEMIDX-NEXT: fadd.s fa0, fa5, fa5
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADFMEMIDX-LABEL: flrw:
-; RV64XTHEADFMEMIDX: # %bb.0:
-; RV64XTHEADFMEMIDX-NEXT: th.flrw fa5, a0, a1, 2
-; RV64XTHEADFMEMIDX-NEXT: fadd.s fa0, fa5, fa5
-; RV64XTHEADFMEMIDX-NEXT: ret
- %1 = getelementptr float, ptr %a, i64 %b
+define float @flrw(ptr %a, iXLen %b) {
+; CHECK-LABEL: flrw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.flrw fa5, a0, a1, 2
+; CHECK-NEXT: fadd.s fa0, fa5, fa5
+; CHECK-NEXT: ret
+ %1 = getelementptr float, ptr %a, iXLen %b
%2 = load float, ptr %1, align 4
%3 = fadd float %2, %2
ret float %3
}
define float @flurw(ptr %a, i32 %b) {
-; RV32XTHEADMEMIDX-LABEL: flurw:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.flrw fa5, a0, a1, 2
-; RV32XTHEADMEMIDX-NEXT: fadd.s fa0, fa5, fa5
-; RV32XTHEADMEMIDX-NEXT: ret
+; RV32XTHEADFMEMIDX-LABEL: flurw:
+; RV32XTHEADFMEMIDX: # %bb.0:
+; RV32XTHEADFMEMIDX-NEXT: th.flrw fa5, a0, a1, 2
+; RV32XTHEADFMEMIDX-NEXT: fadd.s fa0, fa5, fa5
+; RV32XTHEADFMEMIDX-NEXT: ret
;
; RV64XTHEADFMEMIDX-LABEL: flurw:
; RV64XTHEADFMEMIDX: # %bb.0:
@@ -41,30 +35,24 @@ define float @flurw(ptr %a, i32 %b) {
ret float %4
}
-define void @fsrw(ptr %a, i64 %b, float %c) {
-; RV32XTHEADMEMIDX-LABEL: fsrw:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: fadd.s fa5, fa0, fa0
-; RV32XTHEADMEMIDX-NEXT: th.fsrw fa5, a0, a1, 2
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADFMEMIDX-LABEL: fsrw:
-; RV64XTHEADFMEMIDX: # %bb.0:
-; RV64XTHEADFMEMIDX-NEXT: fadd.s fa5, fa0, fa0
-; RV64XTHEADFMEMIDX-NEXT: th.fsrw fa5, a0, a1, 2
-; RV64XTHEADFMEMIDX-NEXT: ret
+define void @fsrw(ptr %a, iXLen %b, float %c) {
+; CHECK-LABEL: fsrw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fadd.s fa5, fa0, fa0
+; CHECK-NEXT: th.fsrw fa5, a0, a1, 2
+; CHECK-NEXT: ret
%1 = fadd float %c, %c
- %2 = getelementptr float, ptr %a, i64 %b
+ %2 = getelementptr float, ptr %a, iXLen %b
store float %1, ptr %2, align 4
ret void
}
define void @fsurw(ptr %a, i32 %b, float %c) {
-; RV32XTHEADMEMIDX-LABEL: fsurw:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: fadd.s fa5, fa0, fa0
-; RV32XTHEADMEMIDX-NEXT: th.fsrw fa5, a0, a1, 2
-; RV32XTHEADMEMIDX-NEXT: ret
+; RV32XTHEADFMEMIDX-LABEL: fsurw:
+; RV32XTHEADFMEMIDX: # %bb.0:
+; RV32XTHEADFMEMIDX-NEXT: fadd.s fa5, fa0, fa0
+; RV32XTHEADFMEMIDX-NEXT: th.fsrw fa5, a0, a1, 2
+; RV32XTHEADFMEMIDX-NEXT: ret
;
; RV64XTHEADFMEMIDX-LABEL: fsurw:
; RV64XTHEADFMEMIDX: # %bb.0:
@@ -78,30 +66,24 @@ define void @fsurw(ptr %a, i32 %b, float %c) {
ret void
}
-define double @flrd(ptr %a, i64 %b) {
-; RV32XTHEADMEMIDX-LABEL: flrd:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.flrd fa5, a0, a1, 3
-; RV32XTHEADMEMIDX-NEXT: fadd.d fa0, fa5, fa5
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADFMEMIDX-LABEL: flrd:
-; RV64XTHEADFMEMIDX: # %bb.0:
-; RV64XTHEADFMEMIDX-NEXT: th.flrd fa5, a0, a1, 3
-; RV64XTHEADFMEMIDX-NEXT: fadd.d fa0, fa5, fa5
-; RV64XTHEADFMEMIDX-NEXT: ret
- %1 = getelementptr double, ptr %a, i64 %b
+define double @flrd(ptr %a, iXLen %b) {
+; CHECK-LABEL: flrd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.flrd fa5, a0, a1, 3
+; CHECK-NEXT: fadd.d fa0, fa5, fa5
+; CHECK-NEXT: ret
+ %1 = getelementptr double, ptr %a, iXLen %b
%2 = load double, ptr %1, align 8
%3 = fadd double %2, %2
ret double %3
}
define double @flurd(ptr %a, i32 %b) {
-; RV32XTHEADMEMIDX-LABEL: flurd:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.flrd fa5, a0, a1, 3
-; RV32XTHEADMEMIDX-NEXT: fadd.d fa0, fa5, fa5
-; RV32XTHEADMEMIDX-NEXT: ret
+; RV32XTHEADFMEMIDX-LABEL: flurd:
+; RV32XTHEADFMEMIDX: # %bb.0:
+; RV32XTHEADFMEMIDX-NEXT: th.flrd fa5, a0, a1, 3
+; RV32XTHEADFMEMIDX-NEXT: fadd.d fa0, fa5, fa5
+; RV32XTHEADFMEMIDX-NEXT: ret
;
; RV64XTHEADFMEMIDX-LABEL: flurd:
; RV64XTHEADFMEMIDX: # %bb.0:
@@ -115,30 +97,24 @@ define double @flurd(ptr %a, i32 %b) {
ret double %4
}
-define void @fsrd(ptr %a, i64 %b, double %c) {
-; RV32XTHEADMEMIDX-LABEL: fsrd:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: fadd.d fa5, fa0, fa0
-; RV32XTHEADMEMIDX-NEXT: th.fsrd fa5, a0, a1, 3
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADFMEMIDX-LABEL: fsrd:
-; RV64XTHEADFMEMIDX: # %bb.0:
-; RV64XTHEADFMEMIDX-NEXT: fadd.d fa5, fa0, fa0
-; RV64XTHEADFMEMIDX-NEXT: th.fsrd fa5, a0, a1, 3
-; RV64XTHEADFMEMIDX-NEXT: ret
+define void @fsrd(ptr %a, iXLen %b, double %c) {
+; CHECK-LABEL: fsrd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fadd.d fa5, fa0, fa0
+; CHECK-NEXT: th.fsrd fa5, a0, a1, 3
+; CHECK-NEXT: ret
%1 = fadd double %c, %c
- %2 = getelementptr double, ptr %a, i64 %b
+ %2 = getelementptr double, ptr %a, iXLen %b
store double %1, ptr %2, align 8
ret void
}
define void @fsurd(ptr %a, i32 %b, double %c) {
-; RV32XTHEADMEMIDX-LABEL: fsurd:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: fadd.d fa5, fa0, fa0
-; RV32XTHEADMEMIDX-NEXT: th.fsrd fa5, a0, a1, 3
-; RV32XTHEADMEMIDX-NEXT: ret
+; RV32XTHEADFMEMIDX-LABEL: fsurd:
+; RV32XTHEADFMEMIDX: # %bb.0:
+; RV32XTHEADFMEMIDX-NEXT: fadd.d fa5, fa0, fa0
+; RV32XTHEADFMEMIDX-NEXT: th.fsrd fa5, a0, a1, 3
+; RV32XTHEADFMEMIDX-NEXT: ret
;
; RV64XTHEADFMEMIDX-LABEL: fsurd:
; RV64XTHEADFMEMIDX: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
index fc20fcb..a20b08a 100644
--- a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
+++ b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
@@ -1,238 +1,156 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d -mattr=+xtheadmemidx -mattr=+m -verify-machineinstrs < %s \
-; RUN: | FileCheck %s -check-prefix=RV32XTHEADMEMIDX
-; RUN: llc -mtriple=riscv64 -mattr=+d -mattr=+xtheadmemidx -mattr=+m -verify-machineinstrs < %s \
-; RUN: | FileCheck %s -check-prefix=RV64XTHEADMEMIDX
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+d,+xtheadmemidx \
+; RUN: -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV32XTHEADMEMIDX
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+d,+xtheadmemidx \
+; RUN: -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV64XTHEADMEMIDX
define ptr @lbia(ptr %base, ptr %addr.2, i8 %a) {
-; RV32XTHEADMEMIDX-LABEL: lbia:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lbia a3, (a0), -1, 0
-; RV32XTHEADMEMIDX-NEXT: add a2, a3, a2
-; RV32XTHEADMEMIDX-NEXT: sb a2, 0(a1)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lbia:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lbia a3, (a0), -1, 0
-; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2
-; RV64XTHEADMEMIDX-NEXT: sb a2, 0(a1)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i8, ptr %base, i8 0
+; CHECK-LABEL: lbia:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lbia a3, (a0), -1, 0
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: sb a2, 0(a1)
+; CHECK-NEXT: ret
+ %addr = getelementptr i8, ptr %base, iXLen 0
%ld = load i8, ptr %addr
- %addr.1 = getelementptr i8, ptr %base, i8 -1
+ %addr.1 = getelementptr i8, ptr %base, iXLen -1
%res = add i8 %ld, %a
store i8 %res, ptr %addr.2
ret ptr %addr.1
}
define ptr @lbib(ptr %base, i8 %a) {
-; RV32XTHEADMEMIDX-LABEL: lbib:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lbib a2, (a0), 1, 0
-; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1
-; RV32XTHEADMEMIDX-NEXT: sb a1, 1(a0)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lbib:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lbib a2, (a0), 1, 0
-; RV64XTHEADMEMIDX-NEXT: add a1, a2, a1
-; RV64XTHEADMEMIDX-NEXT: sb a1, 1(a0)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i8, ptr %base, i8 1
+; CHECK-LABEL: lbib:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lbib a2, (a0), 1, 0
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: sb a1, 1(a0)
+; CHECK-NEXT: ret
+ %addr = getelementptr i8, ptr %base, iXLen 1
%ld = load i8, ptr %addr
- %addr.1 = getelementptr i8, ptr %base, i8 2
+ %addr.1 = getelementptr i8, ptr %base, iXLen 2
%res = add i8 %ld, %a
store i8 %res, ptr %addr.1
ret ptr %addr
}
-define ptr @lbuia(ptr %base, ptr %addr.2, i64 %a) {
-; RV32XTHEADMEMIDX-LABEL: lbuia:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lbuia a4, (a0), -1, 0
-; RV32XTHEADMEMIDX-NEXT: add a2, a4, a2
-; RV32XTHEADMEMIDX-NEXT: sltu a4, a2, a4
-; RV32XTHEADMEMIDX-NEXT: add a3, a3, a4
-; RV32XTHEADMEMIDX-NEXT: sw a2, 0(a1)
-; RV32XTHEADMEMIDX-NEXT: sw a3, 4(a1)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lbuia:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lbuia a3, (a0), -1, 0
-; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2
-; RV64XTHEADMEMIDX-NEXT: sd a2, 0(a1)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i8, ptr %base, i8 0
+define ptr @lbuia(ptr %base, ptr %addr.2, i32 %a) {
+; CHECK-LABEL: lbuia:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lbuia a3, (a0), -1, 0
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %addr = getelementptr i8, ptr %base, iXLen 0
%ld = load i8, ptr %addr
- %zext = zext i8 %ld to i64
- %addr.1 = getelementptr i8, ptr %base, i8 -1
- %res = add i64 %zext, %a
- store i64 %res, ptr %addr.2
+ %zext = zext i8 %ld to i32
+ %addr.1 = getelementptr i8, ptr %base, iXLen -1
+ %res = add i32 %zext, %a
+ store i32 %res, ptr %addr.2
ret ptr %addr.1
}
-define ptr @lbuib(ptr %base, i64 %a, ptr %addr.1) {
-; RV32XTHEADMEMIDX-LABEL: lbuib:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lbuib a4, (a0), 1, 0
-; RV32XTHEADMEMIDX-NEXT: add a1, a4, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a4, a1, a4
-; RV32XTHEADMEMIDX-NEXT: add a2, a2, a4
-; RV32XTHEADMEMIDX-NEXT: sw a1, 0(a3)
-; RV32XTHEADMEMIDX-NEXT: sw a2, 4(a3)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lbuib:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lbuib a3, (a0), 1, 0
-; RV64XTHEADMEMIDX-NEXT: add a1, a3, a1
-; RV64XTHEADMEMIDX-NEXT: sd a1, 0(a2)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i8, ptr %base, i8 1
+define ptr @lbuib(ptr %base, i32 %a, ptr %addr.1) {
+; CHECK-LABEL: lbuib:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lbuib a3, (a0), 1, 0
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: sw a1, 0(a2)
+; CHECK-NEXT: ret
+ %addr = getelementptr i8, ptr %base, iXLen 1
%ld = load i8, ptr %addr
- %zext = zext i8 %ld to i64
- %res = add i64 %zext, %a
- store i64 %res, ptr %addr.1
+ %zext = zext i8 %ld to i32
+ %res = add i32 %zext, %a
+ store i32 %res, ptr %addr.1
ret ptr %addr
}
define ptr @lhia(ptr %base, ptr %addr.2, i16 %a) {
-; RV32XTHEADMEMIDX-LABEL: lhia:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lhia a3, (a0), -16, 1
-; RV32XTHEADMEMIDX-NEXT: add a2, a3, a2
-; RV32XTHEADMEMIDX-NEXT: sh a2, 0(a1)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lhia:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lhia a3, (a0), -16, 1
-; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2
-; RV64XTHEADMEMIDX-NEXT: sh a2, 0(a1)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i16, ptr %base, i16 0
+; CHECK-LABEL: lhia:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lhia a3, (a0), -16, 1
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: sh a2, 0(a1)
+; CHECK-NEXT: ret
+ %addr = getelementptr i16, ptr %base, iXLen 0
%ld = load i16, ptr %addr
- %addr.1 = getelementptr i16, ptr %base, i16 -16
+ %addr.1 = getelementptr i16, ptr %base, iXLen -16
%res = add i16 %ld, %a
store i16 %res, ptr %addr.2
ret ptr %addr.1
}
define ptr @lhib(ptr %base, i16 %a) {
-; RV32XTHEADMEMIDX-LABEL: lhib:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lhib a2, (a0), 2, 0
-; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1
-; RV32XTHEADMEMIDX-NEXT: sh a1, 2(a0)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lhib:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lhib a2, (a0), 2, 0
-; RV64XTHEADMEMIDX-NEXT: add a1, a2, a1
-; RV64XTHEADMEMIDX-NEXT: sh a1, 2(a0)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i16, ptr %base, i16 1
+; CHECK-LABEL: lhib:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lhib a2, (a0), 2, 0
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: sh a1, 2(a0)
+; CHECK-NEXT: ret
+ %addr = getelementptr i16, ptr %base, iXLen 1
%ld = load i16, ptr %addr
- %addr.1 = getelementptr i16, ptr %base, i16 2
+ %addr.1 = getelementptr i16, ptr %base, iXLen 2
%res = add i16 %ld, %a
store i16 %res, ptr %addr.1
ret ptr %addr
}
-define ptr @lhuia(ptr %base, ptr %addr.2, i64 %a) {
-; RV32XTHEADMEMIDX-LABEL: lhuia:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lhuia a4, (a0), -16, 1
-; RV32XTHEADMEMIDX-NEXT: add a2, a4, a2
-; RV32XTHEADMEMIDX-NEXT: sltu a4, a2, a4
-; RV32XTHEADMEMIDX-NEXT: add a3, a3, a4
-; RV32XTHEADMEMIDX-NEXT: sw a2, 0(a1)
-; RV32XTHEADMEMIDX-NEXT: sw a3, 4(a1)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lhuia:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lhuia a3, (a0), -16, 1
-; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2
-; RV64XTHEADMEMIDX-NEXT: sd a2, 0(a1)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i16, ptr %base, i16 0
+define ptr @lhuia(ptr %base, ptr %addr.2, i32 %a) {
+; CHECK-LABEL: lhuia:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lhuia a3, (a0), -16, 1
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %addr = getelementptr i16, ptr %base, iXLen 0
%ld = load i16, ptr %addr
- %zext = zext i16 %ld to i64
- %addr.1 = getelementptr i16, ptr %base, i16 -16
- %res = add i64 %zext, %a
- store i64 %res, ptr %addr.2
+ %zext = zext i16 %ld to i32
+ %addr.1 = getelementptr i16, ptr %base, iXLen -16
+ %res = add i32 %zext, %a
+ store i32 %res, ptr %addr.2
ret ptr %addr.1
}
-define ptr @lhuib(ptr %base, i64 %a, ptr %addr.1) {
-; RV32XTHEADMEMIDX-LABEL: lhuib:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lhuib a4, (a0), 2, 0
-; RV32XTHEADMEMIDX-NEXT: add a1, a4, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a4, a1, a4
-; RV32XTHEADMEMIDX-NEXT: add a2, a2, a4
-; RV32XTHEADMEMIDX-NEXT: sw a1, 0(a3)
-; RV32XTHEADMEMIDX-NEXT: sw a2, 4(a3)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lhuib:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lhuib a3, (a0), 2, 0
-; RV64XTHEADMEMIDX-NEXT: add a1, a3, a1
-; RV64XTHEADMEMIDX-NEXT: sd a1, 0(a2)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i16, ptr %base, i16 1
+define ptr @lhuib(ptr %base, i32 %a, ptr %addr.1) {
+; CHECK-LABEL: lhuib:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lhuib a3, (a0), 2, 0
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: sw a1, 0(a2)
+; CHECK-NEXT: ret
+ %addr = getelementptr i16, ptr %base, iXLen 1
%ld = load i16, ptr %addr
- %zext = zext i16 %ld to i64
- %res = add i64 %zext, %a
- store i64 %res, ptr %addr.1
+ %zext = zext i16 %ld to i32
+ %res = add i32 %zext, %a
+ store i32 %res, ptr %addr.1
ret ptr %addr
}
define ptr @lwia(ptr %base, ptr %addr.2, i32 %a) {
-; RV32XTHEADMEMIDX-LABEL: lwia:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lwia a3, (a0), -16, 2
-; RV32XTHEADMEMIDX-NEXT: add a2, a3, a2
-; RV32XTHEADMEMIDX-NEXT: sw a2, 0(a1)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lwia:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lwia a3, (a0), -16, 2
-; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2
-; RV64XTHEADMEMIDX-NEXT: sw a2, 0(a1)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i32, ptr %base, i32 0
+; CHECK-LABEL: lwia:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lwia a3, (a0), -16, 2
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %addr = getelementptr i32, ptr %base, iXLen 0
%ld = load i32, ptr %addr
- %addr.1 = getelementptr i32, ptr %base, i32 -16
+ %addr.1 = getelementptr i32, ptr %base, iXLen -16
%res = add i32 %ld, %a
store i32 %res, ptr %addr.2
ret ptr %addr.1
}
define ptr @lwib(ptr %base, i32 %a) {
-; RV32XTHEADMEMIDX-LABEL: lwib:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lwib a2, (a0), 4, 0
-; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1
-; RV32XTHEADMEMIDX-NEXT: sw a1, 4(a0)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lwib:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lwib a2, (a0), 4, 0
-; RV64XTHEADMEMIDX-NEXT: add a1, a2, a1
-; RV64XTHEADMEMIDX-NEXT: sw a1, 4(a0)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i32, ptr %base, i32 1
+; CHECK-LABEL: lwib:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lwib a2, (a0), 4, 0
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: sw a1, 4(a0)
+; CHECK-NEXT: ret
+ %addr = getelementptr i32, ptr %base, iXLen 1
%ld = load i32, ptr %addr
- %addr.1 = getelementptr i32, ptr %base, i32 2
+ %addr.1 = getelementptr i32, ptr %base, iXLen 2
%res = add i32 %ld, %a
store i32 %res, ptr %addr.1
ret ptr %addr
@@ -255,10 +173,10 @@ define ptr @lwuia(ptr %base, ptr %addr.2, i64 %a) {
; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2
; RV64XTHEADMEMIDX-NEXT: sd a2, 0(a1)
; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i32, ptr %base, i32 0
+ %addr = getelementptr i32, ptr %base, iXLen 0
%ld = load i32, ptr %addr
%zext = zext i32 %ld to i64
- %addr.1 = getelementptr i32, ptr %base, i32 -16
+ %addr.1 = getelementptr i32, ptr %base, iXLen -16
%res = add i64 %zext, %a
store i64 %res, ptr %addr.2
ret ptr %addr.1
@@ -281,7 +199,7 @@ define ptr @lwuib(ptr %base, i64 %a, ptr %addr.1) {
; RV64XTHEADMEMIDX-NEXT: add a1, a3, a1
; RV64XTHEADMEMIDX-NEXT: sd a1, 0(a2)
; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i32, ptr %base, i32 1
+ %addr = getelementptr i32, ptr %base, iXLen 1
%ld = load i32, ptr %addr
%zext = zext i32 %ld to i64
%res = add i64 %zext, %a
@@ -309,9 +227,9 @@ define ptr @ldia(ptr %base, ptr %addr.2, i64 %a) {
; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2
; RV64XTHEADMEMIDX-NEXT: sd a2, 0(a1)
; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i64, ptr %base, i64 0
+ %addr = getelementptr i64, ptr %base, iXLen 0
%ld = load i64, ptr %addr
- %addr.1 = getelementptr i64, ptr %base, i64 -16
+ %addr.1 = getelementptr i64, ptr %base, iXLen -16
%res = add i64 %ld, %a
store i64 %res, ptr %addr.2
ret ptr %addr.1
@@ -336,117 +254,81 @@ define ptr @ldib(ptr %base, i64 %a) {
; RV64XTHEADMEMIDX-NEXT: add a1, a2, a1
; RV64XTHEADMEMIDX-NEXT: sd a1, 8(a0)
; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i64, ptr %base, i64 1
+ %addr = getelementptr i64, ptr %base, iXLen 1
%ld = load i64, ptr %addr
- %addr.1 = getelementptr i64, ptr %base, i64 2
+ %addr.1 = getelementptr i64, ptr %base, iXLen 2
%res = add i64 %ld, %a
store i64 %res, ptr %addr.1
ret ptr %addr
}
define ptr @sbia(ptr %base, i8 %a, i8 %b) {
-; RV32XTHEADMEMIDX-LABEL: sbia:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV32XTHEADMEMIDX-NEXT: th.sbia a1, (a0), 1, 0
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: sbia:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV64XTHEADMEMIDX-NEXT: th.sbia a1, (a0), 1, 0
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr.1 = getelementptr i8, ptr %base, i8 1
+; CHECK-LABEL: sbia:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: th.sbia a1, (a0), 1, 0
+; CHECK-NEXT: ret
+ %addr.1 = getelementptr i8, ptr %base, iXLen 1
%res = add i8 %a, %b
store i8 %res, ptr %base
ret ptr %addr.1
}
define ptr @sbib(ptr %base, i8 %a, i8 %b) {
-; RV32XTHEADMEMIDX-LABEL: sbib:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV32XTHEADMEMIDX-NEXT: th.sbib a1, (a0), 1, 0
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: sbib:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV64XTHEADMEMIDX-NEXT: th.sbib a1, (a0), 1, 0
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr.1 = getelementptr i8, ptr %base, i8 1
+; CHECK-LABEL: sbib:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: th.sbib a1, (a0), 1, 0
+; CHECK-NEXT: ret
+ %addr.1 = getelementptr i8, ptr %base, iXLen 1
%res = add i8 %a, %b
store i8 %res, ptr %addr.1
ret ptr %addr.1
}
define ptr @shia(ptr %base, i16 %a, i16 %b) {
-; RV32XTHEADMEMIDX-LABEL: shia:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV32XTHEADMEMIDX-NEXT: th.shia a1, (a0), -9, 1
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: shia:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV64XTHEADMEMIDX-NEXT: th.shia a1, (a0), -9, 1
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr.1 = getelementptr i16, ptr %base, i16 -9
+; CHECK-LABEL: shia:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: th.shia a1, (a0), -9, 1
+; CHECK-NEXT: ret
+ %addr.1 = getelementptr i16, ptr %base, iXLen -9
%res = add i16 %a, %b
store i16 %res, ptr %base
ret ptr %addr.1
}
define ptr @shib(ptr %base, i16 %a, i16 %b) {
-; RV32XTHEADMEMIDX-LABEL: shib:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV32XTHEADMEMIDX-NEXT: th.shib a1, (a0), 2, 0
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: shib:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV64XTHEADMEMIDX-NEXT: th.shib a1, (a0), 2, 0
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr.1 = getelementptr i16, ptr %base, i16 1
+; CHECK-LABEL: shib:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: th.shib a1, (a0), 2, 0
+; CHECK-NEXT: ret
+ %addr.1 = getelementptr i16, ptr %base, iXLen 1
%res = add i16 %a, %b
store i16 %res, ptr %addr.1
ret ptr %addr.1
}
define ptr @swia(ptr %base, i32 %a, i32 %b) {
-; RV32XTHEADMEMIDX-LABEL: swia:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV32XTHEADMEMIDX-NEXT: th.swia a1, (a0), 8, 2
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: swia:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV64XTHEADMEMIDX-NEXT: th.swia a1, (a0), 8, 2
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr.1 = getelementptr i32, ptr %base, i32 8
+; CHECK-LABEL: swia:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: th.swia a1, (a0), 8, 2
+; CHECK-NEXT: ret
+ %addr.1 = getelementptr i32, ptr %base, iXLen 8
%res = add i32 %a, %b
store i32 %res, ptr %base
ret ptr %addr.1
}
define ptr @swib(ptr %base, i32 %a, i32 %b) {
-; RV32XTHEADMEMIDX-LABEL: swib:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV32XTHEADMEMIDX-NEXT: th.swib a1, (a0), -13, 3
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: swib:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV64XTHEADMEMIDX-NEXT: th.swib a1, (a0), -13, 3
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr.1 = getelementptr i32, ptr %base, i32 -26
+; CHECK-LABEL: swib:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: th.swib a1, (a0), -13, 3
+; CHECK-NEXT: ret
+ %addr.1 = getelementptr i32, ptr %base, iXLen -26
%res = add i32 %a, %b
store i32 %res, ptr %addr.1
ret ptr %addr.1
@@ -470,7 +352,7 @@ define ptr @sdia(ptr %base, i64 %a, i64 %b) {
; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2
; RV64XTHEADMEMIDX-NEXT: th.sdia a1, (a0), 8, 3
; RV64XTHEADMEMIDX-NEXT: ret
- %addr.1 = getelementptr i64, ptr %base, i64 8
+ %addr.1 = getelementptr i64, ptr %base, iXLen 8
%res = add i64 %a, %b
store i64 %res, ptr %base
ret ptr %addr.1
@@ -492,48 +374,33 @@ define ptr @sdib(ptr %base, i64 %a, i64 %b) {
; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2
; RV64XTHEADMEMIDX-NEXT: th.sdib a1, (a0), 8, 0
; RV64XTHEADMEMIDX-NEXT: ret
- %addr.1 = getelementptr i64, ptr %base, i64 1
+ %addr.1 = getelementptr i64, ptr %base, iXLen 1
%res = add i64 %a, %b
store i64 %res, ptr %addr.1
ret ptr %addr.1
}
-define i8 @lrb_anyext(ptr %a, i64 %b) {
-; RV32XTHEADMEMIDX-LABEL: lrb_anyext:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrb a0, a0, a1, 0
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lrb_anyext:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lrb a0, a0, a1, 0
-; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i8, ptr %a, i64 %b
+define i8 @lrb_anyext(ptr %a, iXLen %b) {
+; CHECK-LABEL: lrb_anyext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lrb a0, a0, a1, 0
+; CHECK-NEXT: ret
+ %1 = getelementptr i8, ptr %a, iXLen %b
%2 = load i8, ptr %1, align 1
ret i8 %2
}
-define i64 @lrb(ptr %a, i64 %b) {
-; RV32XTHEADMEMIDX-LABEL: lrb:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrb a1, a0, a1, 0
-; RV32XTHEADMEMIDX-NEXT: srai a2, a1, 31
-; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1
-; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2
-; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lrb:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lrb a0, a0, a1, 0
-; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0
-; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i8, ptr %a, i64 %b
+define i32 @lrb(ptr %a, iXLen %b) {
+; CHECK-LABEL: lrb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lrb a0, a0, a1, 0
+; CHECK-NEXT: add a0, a0, a0
+; CHECK-NEXT: ret
+ %1 = getelementptr i8, ptr %a, iXLen %b
%2 = load i8, ptr %1, align 1
- %3 = sext i8 %2 to i64
- %4 = add i64 %3, %3
- ret i64 %4
+ %3 = sext i8 %2 to i32
+ %4 = add i32 %3, %3
+ ret i32 %4
}
define i8 @lurb_anyext(ptr %a, i32 %b) {
@@ -552,15 +419,11 @@ define i8 @lurb_anyext(ptr %a, i32 %b) {
ret i8 %3
}
-define i64 @lurb(ptr %a, i32 %b) {
+define i32 @lurb(ptr %a, i32 %b) {
; RV32XTHEADMEMIDX-LABEL: lurb:
; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrb a1, a0, a1, 0
-; RV32XTHEADMEMIDX-NEXT: srai a2, a1, 31
-; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1
-; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2
-; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1
+; RV32XTHEADMEMIDX-NEXT: th.lrb a0, a0, a1, 0
+; RV32XTHEADMEMIDX-NEXT: add a0, a0, a0
; RV32XTHEADMEMIDX-NEXT: ret
;
; RV64XTHEADMEMIDX-LABEL: lurb:
@@ -571,37 +434,29 @@ define i64 @lurb(ptr %a, i32 %b) {
%1 = zext i32 %b to i64
%2 = getelementptr i8, ptr %a, i64 %1
%3 = load i8, ptr %2, align 1
- %4 = sext i8 %3 to i64
- %5 = add i64 %4, %4
- ret i64 %5
-}
-
-define i64 @lrbu(ptr %a, i64 %b) {
-; RV32XTHEADMEMIDX-LABEL: lrbu:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrbu a1, a0, a1, 0
-; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lrbu:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lrbu a0, a0, a1, 0
-; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0
-; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i8, ptr %a, i64 %b
+ %4 = sext i8 %3 to i32
+ %5 = add i32 %4, %4
+ ret i32 %5
+}
+
+define i32 @lrbu(ptr %a, iXLen %b) {
+; CHECK-LABEL: lrbu:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lrbu a0, a0, a1, 0
+; CHECK-NEXT: add a0, a0, a0
+; CHECK-NEXT: ret
+ %1 = getelementptr i8, ptr %a, iXLen %b
%2 = load i8, ptr %1, align 1
- %3 = zext i8 %2 to i64
- %4 = add i64 %3, %3
- ret i64 %4
+ %3 = zext i8 %2 to i32
+ %4 = add i32 %3, %3
+ ret i32 %4
}
-define i64 @lurbu(ptr %a, i32 %b) {
+define i32 @lurbu(ptr %a, i32 %b) {
; RV32XTHEADMEMIDX-LABEL: lurbu:
; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrbu a1, a0, a1, 0
-; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1
+; RV32XTHEADMEMIDX-NEXT: th.lrbu a0, a0, a1, 0
+; RV32XTHEADMEMIDX-NEXT: add a0, a0, a0
; RV32XTHEADMEMIDX-NEXT: ret
;
; RV64XTHEADMEMIDX-LABEL: lurbu:
@@ -612,47 +467,32 @@ define i64 @lurbu(ptr %a, i32 %b) {
%1 = zext i32 %b to i64
%2 = getelementptr i8, ptr %a, i64 %1
%3 = load i8, ptr %2, align 1
- %4 = zext i8 %3 to i64
- %5 = add i64 %4, %4
- ret i64 %5
+ %4 = zext i8 %3 to i32
+ %5 = add i32 %4, %4
+ ret i32 %5
}
-define i16 @lrh_anyext(ptr %a, i64 %b) {
-; RV32XTHEADMEMIDX-LABEL: lrh_anyext:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrh a0, a0, a1, 1
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lrh_anyext:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lrh a0, a0, a1, 1
-; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i16, ptr %a, i64 %b
+define i16 @lrh_anyext(ptr %a, iXLen %b) {
+; CHECK-LABEL: lrh_anyext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lrh a0, a0, a1, 1
+; CHECK-NEXT: ret
+ %1 = getelementptr i16, ptr %a, iXLen %b
%2 = load i16, ptr %1, align 2
ret i16 %2
}
-define i64 @lrh(ptr %a, i64 %b) {
-; RV32XTHEADMEMIDX-LABEL: lrh:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrh a1, a0, a1, 1
-; RV32XTHEADMEMIDX-NEXT: srai a2, a1, 31
-; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1
-; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2
-; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lrh:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lrh a0, a0, a1, 1
-; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0
-; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i16, ptr %a, i64 %b
+define i32 @lrh(ptr %a, iXLen %b) {
+; CHECK-LABEL: lrh:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lrh a0, a0, a1, 1
+; CHECK-NEXT: add a0, a0, a0
+; CHECK-NEXT: ret
+ %1 = getelementptr i16, ptr %a, iXLen %b
%2 = load i16, ptr %1, align 2
- %3 = sext i16 %2 to i64
- %4 = add i64 %3, %3
- ret i64 %4
+ %3 = sext i16 %2 to i32
+ %4 = add i32 %3, %3
+ ret i32 %4
}
define i16 @lurh_anyext(ptr %a, i32 %b) {
@@ -671,15 +511,11 @@ define i16 @lurh_anyext(ptr %a, i32 %b) {
ret i16 %3
}
-define i64 @lurh(ptr %a, i32 %b) {
+define i32 @lurh(ptr %a, i32 %b) {
; RV32XTHEADMEMIDX-LABEL: lurh:
; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrh a1, a0, a1, 1
-; RV32XTHEADMEMIDX-NEXT: srai a2, a1, 31
-; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1
-; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2
-; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1
+; RV32XTHEADMEMIDX-NEXT: th.lrh a0, a0, a1, 1
+; RV32XTHEADMEMIDX-NEXT: add a0, a0, a0
; RV32XTHEADMEMIDX-NEXT: ret
;
; RV64XTHEADMEMIDX-LABEL: lurh:
@@ -690,37 +526,29 @@ define i64 @lurh(ptr %a, i32 %b) {
%1 = zext i32 %b to i64
%2 = getelementptr i16, ptr %a, i64 %1
%3 = load i16, ptr %2, align 2
- %4 = sext i16 %3 to i64
- %5 = add i64 %4, %4
- ret i64 %5
-}
-
-define i64 @lrhu(ptr %a, i64 %b) {
-; RV32XTHEADMEMIDX-LABEL: lrhu:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrhu a1, a0, a1, 1
-; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lrhu:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lrhu a0, a0, a1, 1
-; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0
-; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i16, ptr %a, i64 %b
+ %4 = sext i16 %3 to i32
+ %5 = add i32 %4, %4
+ ret i32 %5
+}
+
+define i32 @lrhu(ptr %a, iXLen %b) {
+; CHECK-LABEL: lrhu:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lrhu a0, a0, a1, 1
+; CHECK-NEXT: add a0, a0, a0
+; CHECK-NEXT: ret
+ %1 = getelementptr i16, ptr %a, iXLen %b
%2 = load i16, ptr %1, align 2
- %3 = zext i16 %2 to i64
- %4 = add i64 %3, %3
- ret i64 %4
+ %3 = zext i16 %2 to i32
+ %4 = add i32 %3, %3
+ ret i32 %4
}
-define i64 @lurhu(ptr %a, i32 %b) {
+define i32 @lurhu(ptr %a, i32 %b) {
; RV32XTHEADMEMIDX-LABEL: lurhu:
; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrhu a1, a0, a1, 1
-; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1
+; RV32XTHEADMEMIDX-NEXT: th.lrhu a0, a0, a1, 1
+; RV32XTHEADMEMIDX-NEXT: add a0, a0, a0
; RV32XTHEADMEMIDX-NEXT: ret
;
; RV64XTHEADMEMIDX-LABEL: lurhu:
@@ -731,27 +559,22 @@ define i64 @lurhu(ptr %a, i32 %b) {
%1 = zext i32 %b to i64
%2 = getelementptr i16, ptr %a, i64 %1
%3 = load i16, ptr %2, align 2
- %4 = zext i16 %3 to i64
- %5 = add i64 %4, %4
- ret i64 %5
+ %4 = zext i16 %3 to i32
+ %5 = add i32 %4, %4
+ ret i32 %5
}
-define i32 @lrw_anyext(ptr %a, i64 %b) {
-; RV32XTHEADMEMIDX-LABEL: lrw_anyext:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrw a0, a0, a1, 2
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lrw_anyext:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lrw a0, a0, a1, 2
-; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i32, ptr %a, i64 %b
+define i32 @lrw_anyext(ptr %a, iXLen %b) {
+; CHECK-LABEL: lrw_anyext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lrw a0, a0, a1, 2
+; CHECK-NEXT: ret
+ %1 = getelementptr i32, ptr %a, iXLen %b
%2 = load i32, ptr %1, align 4
ret i32 %2
}
-define i64 @lrw(ptr %a, i64 %b) {
+define i64 @lrw(ptr %a, iXLen %b) {
; RV32XTHEADMEMIDX-LABEL: lrw:
; RV32XTHEADMEMIDX: # %bb.0:
; RV32XTHEADMEMIDX-NEXT: th.lrw a1, a0, a1, 2
@@ -767,7 +590,7 @@ define i64 @lrw(ptr %a, i64 %b) {
; RV64XTHEADMEMIDX-NEXT: th.lrw a0, a0, a1, 2
; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0
; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i32, ptr %a, i64 %b
+ %1 = getelementptr i32, ptr %a, iXLen %b
%2 = load i32, ptr %1, align 4
%3 = sext i32 %2 to i64
%4 = add i64 %3, %3
@@ -814,7 +637,7 @@ define i64 @lurw(ptr %a, i32 %b) {
ret i64 %5
}
-define i64 @lrwu(ptr %a, i64 %b) {
+define i64 @lrwu(ptr %a, iXLen %b) {
; RV32XTHEADMEMIDX-LABEL: lrwu:
; RV32XTHEADMEMIDX: # %bb.0:
; RV32XTHEADMEMIDX-NEXT: th.lrw a1, a0, a1, 2
@@ -827,7 +650,7 @@ define i64 @lrwu(ptr %a, i64 %b) {
; RV64XTHEADMEMIDX-NEXT: th.lrwu a0, a0, a1, 2
; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0
; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i32, ptr %a, i64 %b
+ %1 = getelementptr i32, ptr %a, iXLen %b
%2 = load i32, ptr %1, align 4
%3 = zext i32 %2 to i64
%4 = add i64 %3, %3
@@ -855,7 +678,7 @@ define i64 @lurwu(ptr %a, i32 %b) {
ret i64 %5
}
-define i64 @lrd(ptr %a, i64 %b) {
+define i64 @lrd(ptr %a, iXLen %b) {
; RV32XTHEADMEMIDX-LABEL: lrd:
; RV32XTHEADMEMIDX: # %bb.0:
; RV32XTHEADMEMIDX-NEXT: th.lrw a2, a0, a1, 3
@@ -872,13 +695,13 @@ define i64 @lrd(ptr %a, i64 %b) {
; RV64XTHEADMEMIDX-NEXT: th.lrd a0, a0, a1, 3
; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0
; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i64, ptr %a, i64 %b
+ %1 = getelementptr i64, ptr %a, iXLen %b
%2 = load i64, ptr %1, align 8
%3 = add i64 %2, %2
ret i64 %3
}
-define i64 @lrd_2(ptr %a, i64 %b) {
+define i64 @lrd_2(ptr %a, iXLen %b) {
; RV32XTHEADMEMIDX-LABEL: lrd_2:
; RV32XTHEADMEMIDX: # %bb.0:
; RV32XTHEADMEMIDX-NEXT: addi a2, a0, 96
@@ -897,8 +720,8 @@ define i64 @lrd_2(ptr %a, i64 %b) {
; RV64XTHEADMEMIDX-NEXT: th.lrd a0, a0, a1, 3
; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0
; RV64XTHEADMEMIDX-NEXT: ret
- %1 = add i64 %b, 12
- %2 = getelementptr i64, ptr %a, i64 %1
+ %1 = add iXLen %b, 12
+ %2 = getelementptr i64, ptr %a, iXLen %1
%3 = load i64, ptr %2, align 8
%4 = add i64 %3, %3
ret i64 %4
@@ -928,20 +751,14 @@ define i64 @lurd(ptr %a, i32 %b) {
ret i64 %4
}
-define void @srb(ptr %a, i64 %b, i8 %c) {
-; RV32XTHEADMEMIDX-LABEL: srb:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a3, a3, a3
-; RV32XTHEADMEMIDX-NEXT: th.srb a3, a0, a1, 0
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: srb:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a2, a2, a2
-; RV64XTHEADMEMIDX-NEXT: th.srb a2, a0, a1, 0
-; RV64XTHEADMEMIDX-NEXT: ret
+define void @srb(ptr %a, iXLen %b, i8 %c) {
+; CHECK-LABEL: srb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a2, a2, a2
+; CHECK-NEXT: th.srb a2, a0, a1, 0
+; CHECK-NEXT: ret
%1 = add i8 %c, %c
- %2 = getelementptr i8, ptr %a, i64 %b
+ %2 = getelementptr i8, ptr %a, iXLen %b
store i8 %1, ptr %2, align 1
ret void
}
@@ -965,20 +782,14 @@ define void @surb(ptr %a, i32 %b, i8 %c) {
ret void
}
-define void @srh(ptr %a, i64 %b, i16 %c) {
-; RV32XTHEADMEMIDX-LABEL: srh:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a3, a3, a3
-; RV32XTHEADMEMIDX-NEXT: th.srh a3, a0, a1, 1
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: srh:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a2, a2, a2
-; RV64XTHEADMEMIDX-NEXT: th.srh a2, a0, a1, 1
-; RV64XTHEADMEMIDX-NEXT: ret
+define void @srh(ptr %a, iXLen %b, i16 %c) {
+; CHECK-LABEL: srh:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a2, a2, a2
+; CHECK-NEXT: th.srh a2, a0, a1, 1
+; CHECK-NEXT: ret
%1 = add i16 %c, %c
- %2 = getelementptr i16, ptr %a, i64 %b
+ %2 = getelementptr i16, ptr %a, iXLen %b
store i16 %1, ptr %2, align 2
ret void
}
@@ -1002,20 +813,14 @@ define void @surh(ptr %a, i32 %b, i16 %c) {
ret void
}
-define void @srw(ptr %a, i64 %b, i32 %c) {
-; RV32XTHEADMEMIDX-LABEL: srw:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a3, a3, a3
-; RV32XTHEADMEMIDX-NEXT: th.srw a3, a0, a1, 2
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: srw:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a2, a2, a2
-; RV64XTHEADMEMIDX-NEXT: th.srw a2, a0, a1, 2
-; RV64XTHEADMEMIDX-NEXT: ret
+define void @srw(ptr %a, iXLen %b, i32 %c) {
+; CHECK-LABEL: srw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a2, a2, a2
+; CHECK-NEXT: th.srw a2, a0, a1, 2
+; CHECK-NEXT: ret
%1 = add i32 %c, %c
- %2 = getelementptr i32, ptr %a, i64 %b
+ %2 = getelementptr i32, ptr %a, iXLen %b
store i32 %1, ptr %2, align 4
ret void
}
@@ -1039,16 +844,16 @@ define void @surw(ptr %a, i32 %b, i32 %c) {
ret void
}
-define void @srd(ptr %a, i64 %b, i64 %c) {
+define void @srd(ptr %a, iXLen %b, i64 %c) {
; RV32XTHEADMEMIDX-LABEL: srd:
; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a2, a3, a3
-; RV32XTHEADMEMIDX-NEXT: add a4, a4, a4
-; RV32XTHEADMEMIDX-NEXT: sltu a3, a2, a3
-; RV32XTHEADMEMIDX-NEXT: th.srw a2, a0, a1, 3
-; RV32XTHEADMEMIDX-NEXT: add a3, a4, a3
+; RV32XTHEADMEMIDX-NEXT: add a4, a2, a2
+; RV32XTHEADMEMIDX-NEXT: add a3, a3, a3
+; RV32XTHEADMEMIDX-NEXT: sltu a2, a4, a2
+; RV32XTHEADMEMIDX-NEXT: th.srw a4, a0, a1, 3
+; RV32XTHEADMEMIDX-NEXT: add a2, a3, a2
; RV32XTHEADMEMIDX-NEXT: addi a0, a0, 4
-; RV32XTHEADMEMIDX-NEXT: th.srw a3, a0, a1, 3
+; RV32XTHEADMEMIDX-NEXT: th.srw a2, a0, a1, 3
; RV32XTHEADMEMIDX-NEXT: ret
;
; RV64XTHEADMEMIDX-LABEL: srd:
@@ -1057,7 +862,7 @@ define void @srd(ptr %a, i64 %b, i64 %c) {
; RV64XTHEADMEMIDX-NEXT: th.srd a2, a0, a1, 3
; RV64XTHEADMEMIDX-NEXT: ret
%1 = add i64 %c, %c
- %2 = getelementptr i64, ptr %a, i64 %b
+ %2 = getelementptr i64, ptr %a, iXLen %b
store i64 %1, ptr %2, align 8
ret void
}
@@ -1087,24 +892,18 @@ define void @surd(ptr %a, i32 %b, i64 %c) {
}
define ptr @test_simm5(ptr %base, i32 %a, i32 %b) {
-; RV32XTHEADMEMIDX-LABEL: test_simm5:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV32XTHEADMEMIDX-NEXT: th.swia a1, (a0), -12, 2
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: test_simm5:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV64XTHEADMEMIDX-NEXT: th.swia a1, (a0), -12, 2
-; RV64XTHEADMEMIDX-NEXT: ret
+; CHECK-LABEL: test_simm5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: th.swia a1, (a0), -12, 2
+; CHECK-NEXT: ret
%addr.1 = getelementptr i32, ptr %base, i32 -12
%res = add i32 %a, %b
store i32 %res, ptr %base
ret ptr %addr.1
}
-define i64 @lrd_large_shift(ptr %a, i64 %b) {
+define i64 @lrd_large_shift(ptr %a, iXLen %b) {
; RV32XTHEADMEMIDX-LABEL: lrd_large_shift:
; RV32XTHEADMEMIDX: # %bb.0:
; RV32XTHEADMEMIDX-NEXT: slli a1, a1, 5
@@ -1119,14 +918,14 @@ define i64 @lrd_large_shift(ptr %a, i64 %b) {
; RV64XTHEADMEMIDX-NEXT: add a0, a1, a0
; RV64XTHEADMEMIDX-NEXT: ld a0, 384(a0)
; RV64XTHEADMEMIDX-NEXT: ret
- %1 = add i64 %b, 12
- %2 = shl i64 %1, 2
- %3 = getelementptr i64, ptr %a, i64 %2
+ %1 = add iXLen %b, 12
+ %2 = shl iXLen %1, 2
+ %3 = getelementptr i64, ptr %a, iXLen %2
%4 = load i64, ptr %3, align 8
ret i64 %4
}
-define i64 @lrd_large_offset(ptr %a, i64 %b) {
+define i64 @lrd_large_offset(ptr %a, iXLen %b) {
; RV32XTHEADMEMIDX-LABEL: lrd_large_offset:
; RV32XTHEADMEMIDX: # %bb.0:
; RV32XTHEADMEMIDX-NEXT: slli a1, a1, 3
@@ -1145,8 +944,8 @@ define i64 @lrd_large_offset(ptr %a, i64 %b) {
; RV64XTHEADMEMIDX-NEXT: add a0, a0, a1
; RV64XTHEADMEMIDX-NEXT: ld a0, 1792(a0)
; RV64XTHEADMEMIDX-NEXT: ret
- %1 = add i64 %b, 12000
- %2 = getelementptr i64, ptr %a, i64 %1
+ %1 = add iXLen %b, 12000
+ %2 = getelementptr i64, ptr %a, iXLen %1
%3 = load i64, ptr %2, align 8
ret i64 %3
}
diff --git a/llvm/test/CodeGen/SPARC/tls-sp.ll b/llvm/test/CodeGen/SPARC/tls-sp.ll
new file mode 100644
index 0000000..de9af01
--- /dev/null
+++ b/llvm/test/CodeGen/SPARC/tls-sp.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=sparc -relocation-model=pic < %s | FileCheck --check-prefix=SPARC %s
+; RUN: llc -mtriple=sparc64 -relocation-model=pic < %s | FileCheck --check-prefix=SPARC64 %s
+
+@x = external thread_local global i8
+
+;; Test that we don't over-allocate stack space when calling __tls_get_addr
+;; with the call frame pseudos able to be eliminated.
+define ptr @no_alloca() nounwind {
+; SPARC-LABEL: no_alloca:
+; SPARC: ! %bb.0: ! %entry
+; SPARC-NEXT: save %sp, -96, %sp
+; SPARC-NEXT: .Ltmp0:
+; SPARC-NEXT: call .Ltmp1
+; SPARC-NEXT: .Ltmp2:
+; SPARC-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.Ltmp0)), %i0
+; SPARC-NEXT: .Ltmp1:
+; SPARC-NEXT: or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.Ltmp0)), %i0
+; SPARC-NEXT: add %i0, %o7, %i0
+; SPARC-NEXT: sethi %tgd_hi22(x), %i1
+; SPARC-NEXT: add %i1, %tgd_lo10(x), %i1
+; SPARC-NEXT: add %i0, %i1, %o0, %tgd_add(x)
+; SPARC-NEXT: call __tls_get_addr, %tgd_call(x)
+; SPARC-NEXT: nop
+; SPARC-NEXT: ret
+; SPARC-NEXT: restore %g0, %o0, %o0
+;
+; SPARC64-LABEL: no_alloca:
+; SPARC64: ! %bb.0: ! %entry
+; SPARC64-NEXT: save %sp, -128, %sp
+; SPARC64-NEXT: .Ltmp0:
+; SPARC64-NEXT: rd %pc, %o7
+; SPARC64-NEXT: .Ltmp2:
+; SPARC64-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.Ltmp0)), %i0
+; SPARC64-NEXT: .Ltmp1:
+; SPARC64-NEXT: or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.Ltmp0)), %i0
+; SPARC64-NEXT: add %i0, %o7, %i0
+; SPARC64-NEXT: sethi %tgd_hi22(x), %i1
+; SPARC64-NEXT: add %i1, %tgd_lo10(x), %i1
+; SPARC64-NEXT: add %i0, %i1, %o0, %tgd_add(x)
+; SPARC64-NEXT: call __tls_get_addr, %tgd_call(x)
+; SPARC64-NEXT: nop
+; SPARC64-NEXT: ret
+; SPARC64-NEXT: restore %g0, %o0, %o0
+entry:
+ %0 = call ptr @llvm.threadlocal.address.p0(ptr @x)
+ ret ptr %0
+}
+
+;; Test that %sp is valid for the call to __tls_get_addr. We store to a dynamic
+;; alloca in order to prevent eliminating any call frame pseudos from the call.
+define ptr @dynamic_alloca(i64 %n) nounwind {
+; SPARC-LABEL: dynamic_alloca:
+; SPARC: ! %bb.0: ! %entry
+; SPARC-NEXT: save %sp, -96, %sp
+; SPARC-NEXT: .Ltmp3:
+; SPARC-NEXT: call .Ltmp4
+; SPARC-NEXT: .Ltmp5:
+; SPARC-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp5-.Ltmp3)), %i0
+; SPARC-NEXT: .Ltmp4:
+; SPARC-NEXT: or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp4-.Ltmp3)), %i0
+; SPARC-NEXT: add %i0, %o7, %i0
+; SPARC-NEXT: sethi %tgd_hi22(x), %i2
+; SPARC-NEXT: add %i2, %tgd_lo10(x), %i2
+; SPARC-NEXT: add %i0, %i2, %o0, %tgd_add(x)
+; SPARC-NEXT: call __tls_get_addr, %tgd_call(x)
+; SPARC-NEXT: nop
+; SPARC-NEXT: add %i1, 7, %i0
+; SPARC-NEXT: and %i0, -8, %i0
+; SPARC-NEXT: sub %sp, %i0, %i0
+; SPARC-NEXT: add %i0, -8, %sp
+; SPARC-NEXT: mov 1, %i1
+; SPARC-NEXT: stb %i1, [%i0+88]
+; SPARC-NEXT: ret
+; SPARC-NEXT: restore %g0, %o0, %o0
+;
+; SPARC64-LABEL: dynamic_alloca:
+; SPARC64: ! %bb.0: ! %entry
+; SPARC64-NEXT: save %sp, -128, %sp
+; SPARC64-NEXT: .Ltmp3:
+; SPARC64-NEXT: rd %pc, %o7
+; SPARC64-NEXT: .Ltmp5:
+; SPARC64-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp5-.Ltmp3)), %i1
+; SPARC64-NEXT: .Ltmp4:
+; SPARC64-NEXT: or %i1, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp4-.Ltmp3)), %i1
+; SPARC64-NEXT: add %i1, %o7, %i1
+; SPARC64-NEXT: sethi %tgd_hi22(x), %i2
+; SPARC64-NEXT: add %i2, %tgd_lo10(x), %i2
+; SPARC64-NEXT: add %i1, %i2, %o0, %tgd_add(x)
+; SPARC64-NEXT: call __tls_get_addr, %tgd_call(x)
+; SPARC64-NEXT: nop
+; SPARC64-NEXT: add %i0, 15, %i0
+; SPARC64-NEXT: and %i0, -16, %i0
+; SPARC64-NEXT: sub %sp, %i0, %i0
+; SPARC64-NEXT: mov %i0, %sp
+; SPARC64-NEXT: mov 1, %i1
+; SPARC64-NEXT: stb %i1, [%i0+2175]
+; SPARC64-NEXT: ret
+; SPARC64-NEXT: restore %g0, %o0, %o0
+entry:
+ %0 = call ptr @llvm.threadlocal.address.p0(ptr @x)
+ %1 = alloca i8, i64 %n
+ store i8 1, ptr %1
+ ret ptr %0
+}
diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll
index bbf4d50..8a6a303 100644
--- a/llvm/test/CodeGen/SystemZ/pr60413.ll
+++ b/llvm/test/CodeGen/SystemZ/pr60413.ll
@@ -16,31 +16,31 @@ define dso_local void @m() local_unnamed_addr #1 {
; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
; CHECK-NEXT: aghi %r15, -168
; CHECK-NEXT: lhrl %r1, f+4
+; CHECK-NEXT: sll %r1, 8
; CHECK-NEXT: larl %r2, f
-; CHECK-NEXT: llc %r2, 6(%r2)
-; CHECK-NEXT: larl %r3, e
-; CHECK-NEXT: lb %r0, 3(%r3)
-; CHECK-NEXT: rosbg %r2, %r1, 32, 55, 8
-; CHECK-NEXT: vlvgp %v0, %r2, %r0
-; CHECK-NEXT: vlvgf %v0, %r2, 0
-; CHECK-NEXT: vlvgf %v0, %r2, 2
-; CHECK-NEXT: vlvgp %v1, %r0, %r2
-; CHECK-NEXT: vlvgp %v2, %r2, %r2
-; CHECK-NEXT: lr %r1, %r2
+; CHECK-NEXT: ic %r1, 6(%r2)
+; CHECK-NEXT: larl %r2, e
+; CHECK-NEXT: lb %r0, 3(%r2)
+; CHECK-NEXT: vlvgp %v0, %r0, %r1
+; CHECK-NEXT: vlvgp %v1, %r1, %r0
+; CHECK-NEXT: vlvgf %v1, %r1, 0
+; CHECK-NEXT: vlvgf %v1, %r1, 2
+; CHECK-NEXT: vlvgp %v2, %r1, %r1
+; CHECK-NEXT: # kill: def $r1l killed $r1l killed $r1d
; CHECK-NEXT: nilh %r1, 255
; CHECK-NEXT: chi %r1, 128
; CHECK-NEXT: ipm %r1
; CHECK-NEXT: risbg %r1, %r1, 63, 191, 36
+; CHECK-NEXT: vlvgf %v0, %r0, 0
+; CHECK-NEXT: vlvgf %v0, %r0, 2
; CHECK-NEXT: vgbm %v3, 30583
; CHECK-NEXT: vn %v0, %v0, %v3
-; CHECK-NEXT: vlvgf %v1, %r0, 0
-; CHECK-NEXT: vlvgf %v1, %r0, 2
; CHECK-NEXT: vn %v1, %v1, %v3
; CHECK-NEXT: vrepf %v2, %v2, 1
; CHECK-NEXT: vn %v2, %v2, %v3
; CHECK-NEXT: vrepif %v3, 127
-; CHECK-NEXT: vchlf %v0, %v0, %v3
-; CHECK-NEXT: vlgvf %r13, %v0, 0
+; CHECK-NEXT: vchlf %v1, %v1, %v3
+; CHECK-NEXT: vlgvf %r13, %v1, 0
; CHECK-NEXT: vchlf %v2, %v2, %v3
; CHECK-NEXT: vlgvf %r3, %v2, 1
; CHECK-NEXT: nilf %r3, 1
@@ -54,13 +54,13 @@ define dso_local void @m() local_unnamed_addr #1 {
; CHECK-NEXT: nilf %r14, 1
; CHECK-NEXT: rosbg %r2, %r14, 32, 51, 12
; CHECK-NEXT: rosbg %r2, %r13, 52, 52, 11
-; CHECK-NEXT: vlgvf %r13, %v0, 1
+; CHECK-NEXT: vlgvf %r13, %v1, 1
; CHECK-NEXT: rosbg %r2, %r13, 53, 53, 10
-; CHECK-NEXT: vlgvf %r13, %v0, 2
+; CHECK-NEXT: vlgvf %r13, %v1, 2
; CHECK-NEXT: rosbg %r2, %r13, 54, 54, 9
-; CHECK-NEXT: vlgvf %r13, %v0, 3
+; CHECK-NEXT: vlgvf %r13, %v1, 3
; CHECK-NEXT: rosbg %r2, %r13, 55, 55, 8
-; CHECK-NEXT: vchlf %v0, %v1, %v3
+; CHECK-NEXT: vchlf %v0, %v0, %v3
; CHECK-NEXT: vlgvf %r13, %v0, 0
; CHECK-NEXT: rosbg %r2, %r13, 56, 56, 7
; CHECK-NEXT: vlgvf %r13, %v0, 1
diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
new file mode 100644
index 0000000..97c2311
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
@@ -0,0 +1,1413 @@
+; RUN: opt -S -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
+
+%struct.TwoInts = type { i32, i32 }
+%struct.ThreeInts = type { i32, i32, i32 }
+%struct.FourInts = type { i32, i32, i32, i32 }
+%struct.ThreeShorts = type { i16, i16, i16 }
+%struct.FourShorts = type { i16, i16, i16, i16 }
+%struct.FiveShorts = type { i16, i16, i16, i16, i16 }
+%struct.TwoBytes = type { i8, i8 }
+%struct.ThreeBytes = type { i8, i8, i8 }
+%struct.FourBytes = type { i8, i8, i8, i8 }
+%struct.EightBytes = type { i8, i8, i8, i8, i8, i8, i8, i8 }
+
+; CHECK-LABEL: two_ints_same_op:
+; CHECK: loop
+; CHECK: i32.load
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+; CHECK: i32.load
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %21, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.TwoInts, ptr %1, i32 %8
+ %10 = load i32, ptr %9, align 4
+ %11 = getelementptr inbounds %struct.TwoInts, ptr %2, i32 %8
+ %12 = load i32, ptr %11, align 4
+ %13 = add i32 %12, %10
+ %14 = getelementptr inbounds %struct.TwoInts, ptr %0, i32 %8
+ store i32 %13, ptr %14, align 4
+ %15 = getelementptr inbounds i8, ptr %9, i32 4
+ %16 = load i32, ptr %15, align 4
+ %17 = getelementptr inbounds i8, ptr %11, i32 4
+ %18 = load i32, ptr %17, align 4
+ %19 = add i32 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 4
+ store i32 %19, ptr %20, align 4
+ %21 = add nuw i32 %8, 1
+ %22 = icmp eq i32 %21, %3
+ br i1 %22, label %6, label %7
+}
+
+; CHECK-LABEL: two_ints_vary_op:
+; CHECK: loop
+; CHECK: i32.load
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+; CHECK: i32.load
+; CHECK: i32.load
+; CHECK: i32.sub
+; CHECK: i32.store
+define hidden void @two_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %21, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.TwoInts, ptr %1, i32 %8
+ %10 = load i32, ptr %9, align 4
+ %11 = getelementptr inbounds %struct.TwoInts, ptr %2, i32 %8
+ %12 = load i32, ptr %11, align 4
+ %13 = add i32 %12, %10
+ %14 = getelementptr inbounds %struct.TwoInts, ptr %0, i32 %8
+ store i32 %13, ptr %14, align 4
+ %15 = getelementptr inbounds i8, ptr %9, i32 4
+ %16 = load i32, ptr %15, align 4
+ %17 = getelementptr inbounds i8, ptr %11, i32 4
+ %18 = load i32, ptr %17, align 4
+ %19 = sub i32 %16, %18
+ %20 = getelementptr inbounds i8, ptr %14, i32 4
+ store i32 %19, ptr %20, align 4
+ %21 = add nuw i32 %8, 1
+ %22 = icmp eq i32 %21, %3
+ br i1 %22, label %6, label %7
+}
+
+; CHECK-LABEL: three_ints:
+; CHECK: loop
+; CHECK: i32.load
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+; CHECK: i32.load
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+; CHECK: i32.load
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+define hidden void @three_ints(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %27, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.ThreeInts, ptr %1, i32 %8
+ %10 = load i32, ptr %9, align 4
+ %11 = getelementptr inbounds %struct.ThreeInts, ptr %2, i32 %8
+ %12 = load i32, ptr %11, align 4
+ %13 = add nsw i32 %12, %10
+ %14 = getelementptr inbounds %struct.ThreeInts, ptr %0, i32 %8
+ store i32 %13, ptr %14, align 4
+ %15 = getelementptr inbounds i8, ptr %9, i32 4
+ %16 = load i32, ptr %15, align 4
+ %17 = getelementptr inbounds i8, ptr %11, i32 4
+ %18 = load i32, ptr %17, align 4
+ %19 = add nsw i32 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 4
+ store i32 %19, ptr %20, align 4
+ %21 = getelementptr inbounds i8, ptr %9, i32 8
+ %22 = load i32, ptr %21, align 4
+ %23 = getelementptr inbounds i8, ptr %11, i32 8
+ %24 = load i32, ptr %23, align 4
+ %25 = add nsw i32 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 8
+ store i32 %25, ptr %26, align 4
+ %27 = add nuw i32 %8, 1
+ %28 = icmp eq i32 %27, %3
+ br i1 %28, label %6, label %7
+}
+
+; CHECK-LABEL: three_shorts:
+; CHECK: loop
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.mul
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.mul
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.mul
+; CHECK: i32.store16
+define hidden void @three_shorts(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %27, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.ThreeShorts, ptr %1, i32 %8
+ %10 = load i16, ptr %9, align 2
+ %11 = getelementptr inbounds %struct.ThreeShorts, ptr %2, i32 %8
+ %12 = load i16, ptr %11, align 2
+ %13 = mul i16 %12, %10
+ %14 = getelementptr inbounds %struct.ThreeShorts, ptr %0, i32 %8
+ store i16 %13, ptr %14, align 2
+ %15 = getelementptr inbounds i8, ptr %9, i32 2
+ %16 = load i16, ptr %15, align 2
+ %17 = getelementptr inbounds i8, ptr %11, i32 2
+ %18 = load i16, ptr %17, align 2
+ %19 = mul i16 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 2
+ store i16 %19, ptr %20, align 2
+ %21 = getelementptr inbounds i8, ptr %9, i32 4
+ %22 = load i16, ptr %21, align 2
+ %23 = getelementptr inbounds i8, ptr %11, i32 4
+ %24 = load i16, ptr %23, align 2
+ %25 = mul i16 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 4
+ store i16 %25, ptr %26, align 2
+ %27 = add nuw i32 %8, 1
+ %28 = icmp eq i32 %27, %3
+ br i1 %28, label %6, label %7
+}
+
+; CHECK-LABEL: four_shorts_same_op:
+; CHECK: loop
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.sub
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.sub
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.sub
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.sub
+; CHECK: i32.store16
+define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %33, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.FourShorts, ptr %1, i32 %8
+ %10 = load i16, ptr %9, align 2
+ %11 = getelementptr inbounds %struct.FourShorts, ptr %2, i32 %8
+ %12 = load i16, ptr %11, align 2
+ %13 = sub i16 %10, %12
+ %14 = getelementptr inbounds %struct.FourShorts, ptr %0, i32 %8
+ store i16 %13, ptr %14, align 2
+ %15 = getelementptr inbounds i8, ptr %9, i32 2
+ %16 = load i16, ptr %15, align 2
+ %17 = getelementptr inbounds i8, ptr %11, i32 2
+ %18 = load i16, ptr %17, align 2
+ %19 = sub i16 %16, %18
+ %20 = getelementptr inbounds i8, ptr %14, i32 2
+ store i16 %19, ptr %20, align 2
+ %21 = getelementptr inbounds i8, ptr %9, i32 4
+ %22 = load i16, ptr %21, align 2
+ %23 = getelementptr inbounds i8, ptr %11, i32 4
+ %24 = load i16, ptr %23, align 2
+ %25 = sub i16 %22, %24
+ %26 = getelementptr inbounds i8, ptr %14, i32 4
+ store i16 %25, ptr %26, align 2
+ %27 = getelementptr inbounds i8, ptr %9, i32 6
+ %28 = load i16, ptr %27, align 2
+ %29 = getelementptr inbounds i8, ptr %11, i32 6
+ %30 = load i16, ptr %29, align 2
+ %31 = sub i16 %28, %30
+ %32 = getelementptr inbounds i8, ptr %14, i32 6
+ store i16 %31, ptr %32, align 2
+ %33 = add nuw i32 %8, 1
+ %34 = icmp eq i32 %33, %3
+ br i1 %34, label %6, label %7
+}
+
+; CHECK-LABEL: four_shorts_split_op:
+; CHECK: loop
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.or
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.or
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.xor
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.xor
+; CHECK: i32.store16
+define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %33, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.FourShorts, ptr %1, i32 %8
+ %10 = load i16, ptr %9, align 2
+ %11 = getelementptr inbounds %struct.FourShorts, ptr %2, i32 %8
+ %12 = load i16, ptr %11, align 2
+ %13 = or i16 %12, %10
+ %14 = getelementptr inbounds %struct.FourShorts, ptr %0, i32 %8
+ store i16 %13, ptr %14, align 2
+ %15 = getelementptr inbounds i8, ptr %9, i32 2
+ %16 = load i16, ptr %15, align 2
+ %17 = getelementptr inbounds i8, ptr %11, i32 2
+ %18 = load i16, ptr %17, align 2
+ %19 = or i16 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 2
+ store i16 %19, ptr %20, align 2
+ %21 = getelementptr inbounds i8, ptr %9, i32 4
+ %22 = load i16, ptr %21, align 2
+ %23 = getelementptr inbounds i8, ptr %11, i32 4
+ %24 = load i16, ptr %23, align 2
+ %25 = xor i16 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 4
+ store i16 %25, ptr %26, align 2
+ %27 = getelementptr inbounds i8, ptr %9, i32 6
+ %28 = load i16, ptr %27, align 2
+ %29 = getelementptr inbounds i8, ptr %11, i32 6
+ %30 = load i16, ptr %29, align 2
+ %31 = xor i16 %30, %28
+ %32 = getelementptr inbounds i8, ptr %14, i32 6
+ store i16 %31, ptr %32, align 2
+ %33 = add nuw i32 %8, 1
+ %34 = icmp eq i32 %33, %3
+ br i1 %34, label %6, label %7
+}
+
+; CHECK-LABEL: four_shorts_interleave_op:
+; CHECK: loop
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.or
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.xor
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.or
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.xor
+; CHECK: i32.store16
+define hidden void @four_shorts_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %33, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.FourShorts, ptr %1, i32 %8
+ %10 = load i16, ptr %9, align 2
+ %11 = getelementptr inbounds %struct.FourShorts, ptr %2, i32 %8
+ %12 = load i16, ptr %11, align 2
+ %13 = or i16 %12, %10
+ %14 = getelementptr inbounds %struct.FourShorts, ptr %0, i32 %8
+ store i16 %13, ptr %14, align 2
+ %15 = getelementptr inbounds i8, ptr %9, i32 2
+ %16 = load i16, ptr %15, align 2
+ %17 = getelementptr inbounds i8, ptr %11, i32 2
+ %18 = load i16, ptr %17, align 2
+ %19 = xor i16 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 2
+ store i16 %19, ptr %20, align 2
+ %21 = getelementptr inbounds i8, ptr %9, i32 4
+ %22 = load i16, ptr %21, align 2
+ %23 = getelementptr inbounds i8, ptr %11, i32 4
+ %24 = load i16, ptr %23, align 2
+ %25 = or i16 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 4
+ store i16 %25, ptr %26, align 2
+ %27 = getelementptr inbounds i8, ptr %9, i32 6
+ %28 = load i16, ptr %27, align 2
+ %29 = getelementptr inbounds i8, ptr %11, i32 6
+ %30 = load i16, ptr %29, align 2
+ %31 = xor i16 %30, %28
+ %32 = getelementptr inbounds i8, ptr %14, i32 6
+ store i16 %31, ptr %32, align 2
+ %33 = add nuw i32 %8, 1
+ %34 = icmp eq i32 %33, %3
+ br i1 %34, label %6, label %7
+}
+
+; CHECK-LABEL: five_shorts:
+; CHECK: loop
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.sub
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.sub
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.sub
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.sub
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.sub
+; CHECK: i32.store16
+define hidden void @five_shorts(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %39, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.FiveShorts, ptr %1, i32 %8
+ %10 = load i16, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.FiveShorts, ptr %2, i32 %8
+ %12 = load i16, ptr %11, align 1
+ %13 = sub i16 %10, %12
+ %14 = getelementptr inbounds %struct.FiveShorts, ptr %0, i32 %8
+ store i16 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i16, ptr %9, i32 1
+ %16 = load i16, ptr %15, align 1
+ %17 = getelementptr inbounds i16, ptr %11, i32 1
+ %18 = load i16, ptr %17, align 1
+ %19 = sub i16 %16, %18
+ %20 = getelementptr inbounds i16, ptr %14, i32 1
+ store i16 %19, ptr %20, align 1
+ %21 = getelementptr inbounds i16, ptr %9, i32 2
+ %22 = load i16, ptr %21, align 1
+ %23 = getelementptr inbounds i16, ptr %11, i32 2
+ %24 = load i16, ptr %23, align 1
+ %25 = sub i16 %22, %24
+ %26 = getelementptr inbounds i16, ptr %14, i32 2
+ store i16 %25, ptr %26, align 1
+ %27 = getelementptr inbounds i16, ptr %9, i32 3
+ %28 = load i16, ptr %27, align 1
+ %29 = getelementptr inbounds i16, ptr %11, i32 3
+ %30 = load i16, ptr %29, align 1
+ %31 = sub i16 %28, %30
+ %32 = getelementptr inbounds i16, ptr %14, i32 3
+ store i16 %31, ptr %32, align 1
+ %33 = getelementptr inbounds i16, ptr %9, i32 4
+ %34 = load i16, ptr %33, align 1
+ %35 = getelementptr inbounds i16, ptr %11, i32 4
+ %36 = load i16, ptr %35, align 1
+ %37 = sub i16 %34, %36
+ %38 = getelementptr inbounds i16, ptr %14, i32 4
+ store i16 %37, ptr %38, align 1
+ %39 = add nuw i32 %8, 1
+ %40 = icmp eq i32 %39, %3
+ br i1 %40, label %6, label %7
+}
+
+; CHECK-LABEL: two_bytes_same_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %21, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.TwoBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.TwoBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = mul i8 %12, %10
+ %14 = getelementptr inbounds %struct.TwoBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = mul i8 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = add nuw i32 %8, 1
+ %22 = icmp eq i32 %21, %3
+ br i1 %22, label %6, label %7
+}
+
+; CHECK-LABEL: two_bytes_vary_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+define hidden void @two_bytes_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %21, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.TwoBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.TwoBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = mul i8 %12, %10
+ %14 = getelementptr inbounds %struct.TwoBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = sub i8 %16, %18
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = add nuw i32 %8, 1
+ %22 = icmp eq i32 %21, %3
+ br i1 %22, label %6, label %7
+}
+
+; CHECK-LABEL: three_bytes_same_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.and
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.and
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.and
+; CHECK: i32.store8
+define hidden void @three_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %27, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.ThreeBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.ThreeBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = and i8 %12, %10
+ %14 = getelementptr inbounds %struct.ThreeBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = and i8 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = getelementptr inbounds i8, ptr %9, i32 2
+ %22 = load i8, ptr %21, align 1
+ %23 = getelementptr inbounds i8, ptr %11, i32 2
+ %24 = load i8, ptr %23, align 1
+ %25 = and i8 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 2
+ store i8 %25, ptr %26, align 1
+ %27 = add nuw i32 %8, 1
+ %28 = icmp eq i32 %27, %3
+ br i1 %28, label %6, label %7
+}
+
+; CHECK-LABEL: three_bytes_interleave_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+define hidden void @three_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %27, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.ThreeBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.ThreeBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = add i8 %12, %10
+ %14 = getelementptr inbounds %struct.ThreeBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = sub i8 %16, %18
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = getelementptr inbounds i8, ptr %9, i32 2
+ %22 = load i8, ptr %21, align 1
+ %23 = getelementptr inbounds i8, ptr %11, i32 2
+ %24 = load i8, ptr %23, align 1
+ %25 = add i8 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 2
+ store i8 %25, ptr %26, align 1
+ %27 = add nuw i32 %8, 1
+ %28 = icmp eq i32 %27, %3
+ br i1 %28, label %6, label %7
+}
+
+; CHECK-LABEL: four_bytes_same_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.and
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.and
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.and
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.and
+; CHECK: i32.store8
+define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %33, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.FourBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.FourBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = and i8 %12, %10
+ %14 = getelementptr inbounds %struct.FourBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = and i8 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = getelementptr inbounds i8, ptr %9, i32 2
+ %22 = load i8, ptr %21, align 1
+ %23 = getelementptr inbounds i8, ptr %11, i32 2
+ %24 = load i8, ptr %23, align 1
+ %25 = and i8 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 2
+ store i8 %25, ptr %26, align 1
+ %27 = getelementptr inbounds i8, ptr %9, i32 3
+ %28 = load i8, ptr %27, align 1
+ %29 = getelementptr inbounds i8, ptr %11, i32 3
+ %30 = load i8, ptr %29, align 1
+ %31 = and i8 %30, %28
+ %32 = getelementptr inbounds i8, ptr %14, i32 3
+ store i8 %31, ptr %32, align 1
+ %33 = add nuw i32 %8, 1
+ %34 = icmp eq i32 %33, %3
+ br i1 %34, label %6, label %7
+}
+
+; CHECK-LABEL: four_bytes_split_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %33, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.FourBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.FourBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = mul i8 %12, %10
+ %14 = getelementptr inbounds %struct.FourBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = mul i8 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = getelementptr inbounds i8, ptr %9, i32 2
+ %22 = load i8, ptr %21, align 1
+ %23 = getelementptr inbounds i8, ptr %11, i32 2
+ %24 = load i8, ptr %23, align 1
+ %25 = sub i8 %22, %24
+ %26 = getelementptr inbounds i8, ptr %14, i32 2
+ store i8 %25, ptr %26, align 1
+ %27 = getelementptr inbounds i8, ptr %9, i32 3
+ %28 = load i8, ptr %27, align 1
+ %29 = getelementptr inbounds i8, ptr %11, i32 3
+ %30 = load i8, ptr %29, align 1
+ %31 = sub i8 %28, %30
+ %32 = getelementptr inbounds i8, ptr %14, i32 3
+ store i8 %31, ptr %32, align 1
+ %33 = add nuw i32 %8, 1
+ %34 = icmp eq i32 %33, %3
+ br i1 %34, label %6, label %7
+}
+
+; CHECK-LABEL: four_bytes_interleave_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+define hidden void @four_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %33, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.FourBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.FourBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = add i8 %12, %10
+ %14 = getelementptr inbounds %struct.FourBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = sub i8 %16, %18
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = getelementptr inbounds i8, ptr %9, i32 2
+ %22 = load i8, ptr %21, align 1
+ %23 = getelementptr inbounds i8, ptr %11, i32 2
+ %24 = load i8, ptr %23, align 1
+ %25 = add i8 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 2
+ store i8 %25, ptr %26, align 1
+ %27 = getelementptr inbounds i8, ptr %9, i32 3
+ %28 = load i8, ptr %27, align 1
+ %29 = getelementptr inbounds i8, ptr %11, i32 3
+ %30 = load i8, ptr %29, align 1
+ %31 = sub i8 %28, %30
+ %32 = getelementptr inbounds i8, ptr %14, i32 3
+ store i8 %31, ptr %32, align 1
+ %33 = add nuw i32 %8, 1
+ %34 = icmp eq i32 %33, %3
+ br i1 %34, label %6, label %7
+}
+
+; CHECK-LABEL: eight_bytes_same_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+define hidden void @eight_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %57, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.EightBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.EightBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = mul i8 %12, %10
+ %14 = getelementptr inbounds %struct.EightBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = mul i8 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = getelementptr inbounds i8, ptr %9, i32 2
+ %22 = load i8, ptr %21, align 1
+ %23 = getelementptr inbounds i8, ptr %11, i32 2
+ %24 = load i8, ptr %23, align 1
+ %25 = mul i8 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 2
+ store i8 %25, ptr %26, align 1
+ %27 = getelementptr inbounds i8, ptr %9, i32 3
+ %28 = load i8, ptr %27, align 1
+ %29 = getelementptr inbounds i8, ptr %11, i32 3
+ %30 = load i8, ptr %29, align 1
+ %31 = mul i8 %30, %28
+ %32 = getelementptr inbounds i8, ptr %14, i32 3
+ store i8 %31, ptr %32, align 1
+ %33 = getelementptr inbounds i8, ptr %9, i32 4
+ %34 = load i8, ptr %33, align 1
+ %35 = getelementptr inbounds i8, ptr %11, i32 4
+ %36 = load i8, ptr %35, align 1
+ %37 = mul i8 %36, %34
+ %38 = getelementptr inbounds i8, ptr %14, i32 4
+ store i8 %37, ptr %38, align 1
+ %39 = getelementptr inbounds i8, ptr %9, i32 5
+ %40 = load i8, ptr %39, align 1
+ %41 = getelementptr inbounds i8, ptr %11, i32 5
+ %42 = load i8, ptr %41, align 1
+ %43 = mul i8 %42, %40
+ %44 = getelementptr inbounds i8, ptr %14, i32 5
+ store i8 %43, ptr %44, align 1
+ %45 = getelementptr inbounds i8, ptr %9, i32 6
+ %46 = load i8, ptr %45, align 1
+ %47 = getelementptr inbounds i8, ptr %11, i32 6
+ %48 = load i8, ptr %47, align 1
+ %49 = mul i8 %48, %46
+ %50 = getelementptr inbounds i8, ptr %14, i32 6
+ store i8 %49, ptr %50, align 1
+ %51 = getelementptr inbounds i8, ptr %9, i32 7
+ %52 = load i8, ptr %51, align 1
+ %53 = getelementptr inbounds i8, ptr %11, i32 7
+ %54 = load i8, ptr %53, align 1
+ %55 = mul i8 %54, %52
+ %56 = getelementptr inbounds i8, ptr %14, i32 7
+ store i8 %55, ptr %56, align 1
+ %57 = add nuw i32 %8, 1
+ %58 = icmp eq i32 %57, %3
+ br i1 %58, label %6, label %7
+}
+
+; CHECK-LABEL: eight_bytes_split_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+define hidden void @eight_bytes_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %57, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.EightBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.EightBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = add i8 %12, %10
+ %14 = getelementptr inbounds %struct.EightBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = add i8 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = getelementptr inbounds i8, ptr %9, i32 2
+ %22 = load i8, ptr %21, align 1
+ %23 = getelementptr inbounds i8, ptr %11, i32 2
+ %24 = load i8, ptr %23, align 1
+ %25 = add i8 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 2
+ store i8 %25, ptr %26, align 1
+ %27 = getelementptr inbounds i8, ptr %9, i32 3
+ %28 = load i8, ptr %27, align 1
+ %29 = getelementptr inbounds i8, ptr %11, i32 3
+ %30 = load i8, ptr %29, align 1
+ %31 = add i8 %30, %28
+ %32 = getelementptr inbounds i8, ptr %14, i32 3
+ store i8 %31, ptr %32, align 1
+ %33 = getelementptr inbounds i8, ptr %9, i32 4
+ %34 = load i8, ptr %33, align 1
+ %35 = getelementptr inbounds i8, ptr %11, i32 4
+ %36 = load i8, ptr %35, align 1
+ %37 = sub i8 %34, %36
+ %38 = getelementptr inbounds i8, ptr %14, i32 4
+ store i8 %37, ptr %38, align 1
+ %39 = getelementptr inbounds i8, ptr %9, i32 5
+ %40 = load i8, ptr %39, align 1
+ %41 = getelementptr inbounds i8, ptr %11, i32 5
+ %42 = load i8, ptr %41, align 1
+ %43 = sub i8 %40, %42
+ %44 = getelementptr inbounds i8, ptr %14, i32 5
+ store i8 %43, ptr %44, align 1
+ %45 = getelementptr inbounds i8, ptr %9, i32 6
+ %46 = load i8, ptr %45, align 1
+ %47 = getelementptr inbounds i8, ptr %11, i32 6
+ %48 = load i8, ptr %47, align 1
+ %49 = sub i8 %46, %48
+ %50 = getelementptr inbounds i8, ptr %14, i32 6
+ store i8 %49, ptr %50, align 1
+ %51 = getelementptr inbounds i8, ptr %9, i32 7
+ %52 = load i8, ptr %51, align 1
+ %53 = getelementptr inbounds i8, ptr %11, i32 7
+ %54 = load i8, ptr %53, align 1
+ %55 = sub i8 %52, %54
+ %56 = getelementptr inbounds i8, ptr %14, i32 7
+ store i8 %55, ptr %56, align 1
+ %57 = add nuw i32 %8, 1
+ %58 = icmp eq i32 %57, %3
+ br i1 %58, label %6, label %7
+}
+
+; CHECK-LABEL: eight_bytes_interleave_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+define hidden void @eight_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %57, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.EightBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.EightBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = add i8 %12, %10
+ %14 = getelementptr inbounds %struct.EightBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = sub i8 %16, %18
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = getelementptr inbounds i8, ptr %9, i32 2
+ %22 = load i8, ptr %21, align 1
+ %23 = getelementptr inbounds i8, ptr %11, i32 2
+ %24 = load i8, ptr %23, align 1
+ %25 = add i8 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 2
+ store i8 %25, ptr %26, align 1
+ %27 = getelementptr inbounds i8, ptr %9, i32 3
+ %28 = load i8, ptr %27, align 1
+ %29 = getelementptr inbounds i8, ptr %11, i32 3
+ %30 = load i8, ptr %29, align 1
+ %31 = sub i8 %28, %30
+ %32 = getelementptr inbounds i8, ptr %14, i32 3
+ store i8 %31, ptr %32, align 1
+ %33 = getelementptr inbounds i8, ptr %9, i32 4
+ %34 = load i8, ptr %33, align 1
+ %35 = getelementptr inbounds i8, ptr %11, i32 4
+ %36 = load i8, ptr %35, align 1
+ %37 = add i8 %36, %34
+ %38 = getelementptr inbounds i8, ptr %14, i32 4
+ store i8 %37, ptr %38, align 1
+ %39 = getelementptr inbounds i8, ptr %9, i32 5
+ %40 = load i8, ptr %39, align 1
+ %41 = getelementptr inbounds i8, ptr %11, i32 5
+ %42 = load i8, ptr %41, align 1
+ %43 = sub i8 %40, %42
+ %44 = getelementptr inbounds i8, ptr %14, i32 5
+ store i8 %43, ptr %44, align 1
+ %45 = getelementptr inbounds i8, ptr %9, i32 6
+ %46 = load i8, ptr %45, align 1
+ %47 = getelementptr inbounds i8, ptr %11, i32 6
+ %48 = load i8, ptr %47, align 1
+ %49 = add i8 %48, %46
+ %50 = getelementptr inbounds i8, ptr %14, i32 6
+ store i8 %49, ptr %50, align 1
+ %51 = getelementptr inbounds i8, ptr %9, i32 7
+ %52 = load i8, ptr %51, align 1
+ %53 = getelementptr inbounds i8, ptr %11, i32 7
+ %54 = load i8, ptr %53, align 1
+ %55 = sub i8 %52, %54
+ %56 = getelementptr inbounds i8, ptr %14, i32 7
+ store i8 %55, ptr %56, align 1
+ %57 = add nuw i32 %8, 1
+ %58 = icmp eq i32 %57, %3
+ br i1 %58, label %6, label %7
+}
+
+; CHECK-LABEL: four_bytes_into_four_ints_same_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+define hidden void @four_bytes_into_four_ints_same_op(ptr noalias nocapture noundef %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %49, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.FourBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = zext i8 %10 to i32
+ %12 = getelementptr inbounds %struct.FourBytes, ptr %2, i32 %8
+ %13 = load i8, ptr %12, align 1
+ %14 = zext i8 %13 to i32
+ %15 = mul nuw nsw i32 %14, %11
+ %16 = getelementptr inbounds %struct.FourInts, ptr %0, i32 %8
+ %17 = load i32, ptr %16, align 4
+ %18 = add nsw i32 %15, %17
+ store i32 %18, ptr %16, align 4
+ %19 = getelementptr inbounds i8, ptr %9, i32 1
+ %20 = load i8, ptr %19, align 1
+ %21 = zext i8 %20 to i32
+ %22 = getelementptr inbounds i8, ptr %12, i32 1
+ %23 = load i8, ptr %22, align 1
+ %24 = zext i8 %23 to i32
+ %25 = mul nuw nsw i32 %24, %21
+ %26 = getelementptr inbounds i8, ptr %16, i32 4
+ %27 = load i32, ptr %26, align 4
+ %28 = add nsw i32 %25, %27
+ store i32 %28, ptr %26, align 4
+ %29 = getelementptr inbounds i8, ptr %9, i32 2
+ %30 = load i8, ptr %29, align 1
+ %31 = zext i8 %30 to i32
+ %32 = getelementptr inbounds i8, ptr %12, i32 2
+ %33 = load i8, ptr %32, align 1
+ %34 = zext i8 %33 to i32
+ %35 = mul nuw nsw i32 %34, %31
+ %36 = getelementptr inbounds i8, ptr %16, i32 8
+ %37 = load i32, ptr %36, align 4
+ %38 = add nsw i32 %35, %37
+ store i32 %38, ptr %36, align 4
+ %39 = getelementptr inbounds i8, ptr %9, i32 3
+ %40 = load i8, ptr %39, align 1
+ %41 = zext i8 %40 to i32
+ %42 = getelementptr inbounds i8, ptr %12, i32 3
+ %43 = load i8, ptr %42, align 1
+ %44 = zext i8 %43 to i32
+ %45 = mul nuw nsw i32 %44, %41
+ %46 = getelementptr inbounds i8, ptr %16, i32 12
+ %47 = load i32, ptr %46, align 4
+ %48 = add nsw i32 %45, %47
+ store i32 %48, ptr %46, align 4
+ %49 = add nuw i32 %8, 1
+ %50 = icmp eq i32 %49, %3
+ br i1 %50, label %6, label %7
+}
+
+; CHECK-LABEL: four_bytes_into_four_ints_vary_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.and
+; CHECK: i32.store
+define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %40, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.FourBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = zext i8 %10 to i32
+ %12 = getelementptr inbounds %struct.FourBytes, ptr %2, i32 %8
+ %13 = load i8, ptr %12, align 1
+ %14 = zext i8 %13 to i32
+ %15 = add nuw nsw i32 %14, %11
+ %16 = getelementptr inbounds %struct.FourInts, ptr %0, i32 %8
+ store i32 %15, ptr %16, align 4
+ %17 = getelementptr inbounds i8, ptr %9, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = zext i8 %18 to i32
+ %20 = getelementptr inbounds i8, ptr %12, i32 1
+ %21 = load i8, ptr %20, align 1
+ %22 = zext i8 %21 to i32
+ %23 = sub nsw i32 %19, %22
+ %24 = getelementptr inbounds i8, ptr %16, i32 4
+ store i32 %23, ptr %24, align 4
+ %25 = getelementptr inbounds i8, ptr %9, i32 2
+ %26 = load i8, ptr %25, align 1
+ %27 = zext i8 %26 to i32
+ %28 = getelementptr inbounds i8, ptr %12, i32 2
+ %29 = load i8, ptr %28, align 1
+ %30 = zext i8 %29 to i32
+ %31 = mul nuw nsw i32 %30, %27
+ %32 = getelementptr inbounds i8, ptr %16, i32 8
+ store i32 %31, ptr %32, align 4
+ %33 = getelementptr inbounds i8, ptr %9, i32 3
+ %34 = load i8, ptr %33, align 1
+ %35 = getelementptr inbounds i8, ptr %12, i32 3
+ %36 = load i8, ptr %35, align 1
+ %37 = and i8 %36, %34
+ %38 = zext i8 %37 to i32
+ %39 = getelementptr inbounds i8, ptr %16, i32 12
+ store i32 %38, ptr %39, align 4
+ %40 = add nuw i32 %8, 1
+ %41 = icmp eq i32 %40, %3
+ br i1 %41, label %6, label %7
+}
+
+; CHECK-LABEL: scale_uv_row_down2:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.store8
+define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
+ %5 = icmp sgt i32 %3, 0
+ br i1 %5, label %6, label %19
+
+6: ; preds = %4, %6
+ %7 = phi i32 [ %17, %6 ], [ 0, %4 ]
+ %8 = phi ptr [ %15, %6 ], [ %0, %4 ]
+ %9 = phi ptr [ %16, %6 ], [ %2, %4 ]
+ %10 = getelementptr inbounds i8, ptr %8, i32 2
+ %11 = load i8, ptr %10, align 1
+ store i8 %11, ptr %9, align 1
+ %12 = getelementptr inbounds i8, ptr %8, i32 3
+ %13 = load i8, ptr %12, align 1
+ %14 = getelementptr inbounds i8, ptr %9, i32 1
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %8, i32 4
+ %16 = getelementptr inbounds i8, ptr %9, i32 2
+ %17 = add nuw nsw i32 %7, 1
+ %18 = icmp eq i32 %17, %3
+ br i1 %18, label %19, label %6
+
+19: ; preds = %6, %4
+ ret void
+}
+
+; CHECK-LABEL: scale_uv_row_down2_box:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.shr_u
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.shr_u
+; CHECK: i32.store8
+define hidden void @scale_uv_row_down2_box(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
+ %5 = icmp sgt i32 %3, 0
+ br i1 %5, label %6, label %54
+
+6: ; preds = %4
+ %7 = add nsw i32 %1, 2
+ %8 = add nsw i32 %1, 1
+ %9 = add nsw i32 %1, 3
+ br label %10
+
+10: ; preds = %6, %10
+ %11 = phi i32 [ 0, %6 ], [ %52, %10 ]
+ %12 = phi ptr [ %0, %6 ], [ %50, %10 ]
+ %13 = phi ptr [ %2, %6 ], [ %51, %10 ]
+ %14 = load i8, ptr %12, align 1
+ %15 = zext i8 %14 to i16
+ %16 = getelementptr inbounds i8, ptr %12, i32 2
+ %17 = load i8, ptr %16, align 1
+ %18 = zext i8 %17 to i16
+ %19 = getelementptr inbounds i8, ptr %12, i32 %1
+ %20 = load i8, ptr %19, align 1
+ %21 = zext i8 %20 to i16
+ %22 = getelementptr inbounds i8, ptr %12, i32 %7
+ %23 = load i8, ptr %22, align 1
+ %24 = zext i8 %23 to i16
+ %25 = add nuw nsw i16 %15, 2
+ %26 = add nuw nsw i16 %25, %18
+ %27 = add nuw nsw i16 %26, %21
+ %28 = add nuw nsw i16 %27, %24
+ %29 = lshr i16 %28, 2
+ %30 = trunc nuw i16 %29 to i8
+ store i8 %30, ptr %13, align 1
+ %31 = getelementptr inbounds i8, ptr %12, i32 1
+ %32 = load i8, ptr %31, align 1
+ %33 = zext i8 %32 to i16
+ %34 = getelementptr inbounds i8, ptr %12, i32 3
+ %35 = load i8, ptr %34, align 1
+ %36 = zext i8 %35 to i16
+ %37 = getelementptr inbounds i8, ptr %12, i32 %8
+ %38 = load i8, ptr %37, align 1
+ %39 = zext i8 %38 to i16
+ %40 = getelementptr inbounds i8, ptr %12, i32 %9
+ %41 = load i8, ptr %40, align 1
+ %42 = zext i8 %41 to i16
+ %43 = add nuw nsw i16 %33, 2
+ %44 = add nuw nsw i16 %43, %36
+ %45 = add nuw nsw i16 %44, %39
+ %46 = add nuw nsw i16 %45, %42
+ %47 = lshr i16 %46, 2
+ %48 = trunc nuw i16 %47 to i8
+ %49 = getelementptr inbounds i8, ptr %13, i32 1
+ store i8 %48, ptr %49, align 1
+ %50 = getelementptr inbounds i8, ptr %12, i32 4
+ %51 = getelementptr inbounds i8, ptr %13, i32 2
+ %52 = add nuw nsw i32 %11, 1
+ %53 = icmp eq i32 %52, %3
+ br i1 %53, label %54, label %10
+
+54: ; preds = %10, %4
+ ret void
+}
+
+; CHECK-LABEL: scale_uv_row_down2_linear:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.shr_u
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.shr_u
+; CHECK: i32.store8
+define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
+ %5 = icmp sgt i32 %3, 0
+ br i1 %5, label %6, label %34
+
+6: ; preds = %4, %6
+ %7 = phi i32 [ %32, %6 ], [ 0, %4 ]
+ %8 = phi ptr [ %30, %6 ], [ %0, %4 ]
+ %9 = phi ptr [ %31, %6 ], [ %2, %4 ]
+ %10 = load i8, ptr %8, align 1
+ %11 = zext i8 %10 to i16
+ %12 = getelementptr inbounds i8, ptr %8, i32 2
+ %13 = load i8, ptr %12, align 1
+ %14 = zext i8 %13 to i16
+ %15 = add nuw nsw i16 %11, 1
+ %16 = add nuw nsw i16 %15, %14
+ %17 = lshr i16 %16, 1
+ %18 = trunc nuw i16 %17 to i8
+ store i8 %18, ptr %9, align 1
+ %19 = getelementptr inbounds i8, ptr %8, i32 1
+ %20 = load i8, ptr %19, align 1
+ %21 = zext i8 %20 to i16
+ %22 = getelementptr inbounds i8, ptr %8, i32 3
+ %23 = load i8, ptr %22, align 1
+ %24 = zext i8 %23 to i16
+ %25 = add nuw nsw i16 %21, 1
+ %26 = add nuw nsw i16 %25, %24
+ %27 = lshr i16 %26, 1
+ %28 = trunc nuw i16 %27 to i8
+ %29 = getelementptr inbounds i8, ptr %9, i32 1
+ store i8 %28, ptr %29, align 1
+ %30 = getelementptr inbounds i8, ptr %8, i32 4
+ %31 = getelementptr inbounds i8, ptr %9, i32 2
+ %32 = add nuw nsw i32 %7, 1
+ %33 = icmp eq i32 %32, %3
+ br i1 %33, label %34, label %6
+
+34: ; preds = %6, %4
+ ret void
+}
diff --git a/llvm/test/CodeGen/WinEH/wineh-noret-cleanup.ll b/llvm/test/CodeGen/WinEH/wineh-noret-cleanup.ll
index 3b3a460..ab6672e 100644
--- a/llvm/test/CodeGen/WinEH/wineh-noret-cleanup.ll
+++ b/llvm/test/CodeGen/WinEH/wineh-noret-cleanup.ll
@@ -1,4 +1,4 @@
-; RUN: sed -e s/.Cxx:// %s | llc -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=CXX,X64CXX
+; RUN: sed -e s/.Cxx:// %s | llc -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=CXX
; RUN: sed -e s/.Seh:// %s | llc -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=SEH
; RUN: %if aarch64-registered-target %{ sed -e s/.Cxx:// %s | llc -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefix=CXX %}
; RUN: %if aarch64-registered-target %{ sed -e s/.Seh:// %s | llc -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefix=SEH %}
@@ -49,18 +49,14 @@ catch.body.2:
; CXX-NEXT: .[[ENTRY:long|word]] .Lfunc_begin0@IMGREL
; CXX-NEXT: .[[ENTRY]] -1
; CXX-NEXT: .[[ENTRY]] .Ltmp0@IMGREL
-; X64CXX-SAME: +1
; CXX-NEXT: .[[ENTRY]] 1
; CXX-NEXT: .[[ENTRY]] .Ltmp1@IMGREL
-; X64CXX-SAME: +1
; CXX-NEXT: .[[ENTRY]] -1
; CXX-NEXT: .[[ENTRY]] "?catch$3@?0?test@4HA"@IMGREL
; CXX-NEXT: .[[ENTRY]] 2
; CXX-NEXT: .[[ENTRY]] .Ltmp2@IMGREL
-; X64CXX-SAME: +1
; CXX-NEXT: .[[ENTRY]] 3
; CXX-NEXT: .[[ENTRY]] .Ltmp3@IMGREL
-; X64CXX-SAME: +1
; CXX-NEXT: .[[ENTRY]] 2
; CXX-NEXT: .[[ENTRY]] "?catch$5@?0?test@4HA"@IMGREL
; CXX-NEXT: .[[ENTRY]] 4
@@ -70,19 +66,19 @@ catch.body.2:
; SEH: .LBB0_[[CATCH:[0-9]+]]: {{.*}} %catch.body
; SEH-LABEL: .Llsda_begin0:
; SEH-NEXT: .[[ENTRY:long|word]] .Ltmp0@IMGREL
-; SEH-NEXT: .[[ENTRY]] .Ltmp1@IMGREL+1
+; SEH-NEXT: .[[ENTRY]] .Ltmp1@IMGREL
; SEH-NEXT: .[[ENTRY]] dummy_filter@IMGREL
; SEH-NEXT: .[[ENTRY]] .LBB0_[[CATCH]]@IMGREL
; SEH-NEXT: .[[ENTRY]] .Ltmp0@IMGREL
-; SEH-NEXT: .[[ENTRY]] .Ltmp1@IMGREL+1
+; SEH-NEXT: .[[ENTRY]] .Ltmp1@IMGREL
; SEH-NEXT: .[[ENTRY]] dummy_filter@IMGREL
; SEH-NEXT: .[[ENTRY]] .LBB0_[[CATCH2]]@IMGREL
; SEH-NEXT: .[[ENTRY]] .Ltmp2@IMGREL
-; SEH-NEXT: .[[ENTRY]] .Ltmp3@IMGREL+1
+; SEH-NEXT: .[[ENTRY]] .Ltmp3@IMGREL
; SEH-NEXT: .[[ENTRY]] "?dtor$[[DTOR:[0-9]+]]@?0?test@4HA"@IMGREL
; SEH-NEXT: .[[ENTRY]] 0
; SEH-NEXT: .[[ENTRY]] .Ltmp2@IMGREL
-; SEH-NEXT: .[[ENTRY]] .Ltmp3@IMGREL+1
+; SEH-NEXT: .[[ENTRY]] .Ltmp3@IMGREL
; SEH-NEXT: .[[ENTRY]] dummy_filter@IMGREL
; SEH-NEXT: .[[ENTRY]] .LBB0_[[CATCH2]]@IMGREL
; SEH-NEXT: .Llsda_end0:
diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll
index 2911edf..d9064c6 100644
--- a/llvm/test/CodeGen/X86/abds-neg.ll
+++ b/llvm/test/CodeGen/X86/abds-neg.ll
@@ -1076,15 +1076,15 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: sarl $31, %edx
-; X86-NEXT: xorl %edx, %ecx
; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: xorl %edx, %ecx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: subl %esi, %eax
-; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: sbbl %esi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
@@ -1107,15 +1107,15 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: sarl $31, %edx
-; X86-NEXT: xorl %edx, %ecx
; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: xorl %edx, %ecx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: subl %esi, %eax
-; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: sbbl %esi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
@@ -1142,32 +1142,32 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl 36(%ebp), %eax
; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 24(%ebp), %edi
; X86-NEXT: movl 28(%ebp), %edx
-; X86-NEXT: movl 24(%ebp), %esi
-; X86-NEXT: subl 40(%ebp), %esi
+; X86-NEXT: subl 40(%ebp), %edi
; X86-NEXT: sbbl 44(%ebp), %edx
; X86-NEXT: sbbl 48(%ebp), %ecx
; X86-NEXT: sbbl 52(%ebp), %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: sarl $31, %edi
-; X86-NEXT: xorl %edi, %eax
-; X86-NEXT: xorl %edi, %ecx
-; X86-NEXT: xorl %edi, %edx
-; X86-NEXT: xorl %edi, %esi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: subl %esi, %ebx
-; X86-NEXT: movl %edi, %esi
-; X86-NEXT: sbbl %edx, %esi
-; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: xorl %esi, %eax
+; X86-NEXT: xorl %esi, %ecx
+; X86-NEXT: xorl %esi, %edx
+; X86-NEXT: xorl %esi, %edi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: subl %edi, %ebx
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: sbbl %ecx, %edx
-; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: sbbl %eax, %esi
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl %ebx, (%eax)
-; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edi, 4(%eax)
; X86-NEXT: movl %edx, 8(%eax)
-; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -1203,32 +1203,32 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl 36(%ebp), %eax
; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 24(%ebp), %edi
; X86-NEXT: movl 28(%ebp), %edx
-; X86-NEXT: movl 24(%ebp), %esi
-; X86-NEXT: subl 40(%ebp), %esi
+; X86-NEXT: subl 40(%ebp), %edi
; X86-NEXT: sbbl 44(%ebp), %edx
; X86-NEXT: sbbl 48(%ebp), %ecx
; X86-NEXT: sbbl 52(%ebp), %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: sarl $31, %edi
-; X86-NEXT: xorl %edi, %eax
-; X86-NEXT: xorl %edi, %ecx
-; X86-NEXT: xorl %edi, %edx
-; X86-NEXT: xorl %edi, %esi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: subl %esi, %ebx
-; X86-NEXT: movl %edi, %esi
-; X86-NEXT: sbbl %edx, %esi
-; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: xorl %esi, %eax
+; X86-NEXT: xorl %esi, %ecx
+; X86-NEXT: xorl %esi, %edx
+; X86-NEXT: xorl %esi, %edi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: subl %edi, %ebx
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: sbbl %ecx, %edx
-; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: sbbl %eax, %esi
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl %ebx, (%eax)
-; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edi, 4(%eax)
; X86-NEXT: movl %edx, 8(%eax)
-; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index 9be8166..0de308a 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -1734,20 +1734,20 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: not_avg_v16i8_wide_constants:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm1
-; SSE2-NEXT: movdqa (%rsi), %xmm2
+; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
-; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
-; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
-; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: movd %eax, %xmm4
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
; SSE2-NEXT: movd %eax, %xmm5
@@ -1762,6 +1762,9 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; SSE2-NEXT: movd %eax, %xmm8
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
+; SSE2-NEXT: movd %eax, %xmm10
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: decl %eax
; SSE2-NEXT: movd %eax, %xmm9
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
@@ -1771,9 +1774,6 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; SSE2-NEXT: movd %eax, %xmm12
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
-; SSE2-NEXT: movd %eax, %xmm10
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: decl %eax
; SSE2-NEXT: movd %eax, %xmm13
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
@@ -1783,43 +1783,45 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; SSE2-NEXT: movd %eax, %xmm15
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,0,0]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE2-NEXT: movapd %xmm4, %xmm5
; SSE2-NEXT: andpd %xmm1, %xmm5
; SSE2-NEXT: xorpd %xmm4, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: paddw %xmm5, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,0,0,0]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; SSE2-NEXT: movapd %xmm0, %xmm3
-; SSE2-NEXT: andpd %xmm2, %xmm3
-; SSE2-NEXT: xorpd %xmm0, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm2
-; SSE2-NEXT: paddw %xmm3, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSE2-NEXT: movapd %xmm2, %xmm3
+; SSE2-NEXT: andpd %xmm0, %xmm3
+; SSE2-NEXT: xorpd %xmm2, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: paddw %xmm3, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: packuswb %xmm0, %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: retq
;
@@ -1829,71 +1831,75 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vpextrw $3, %xmm3, %edx
-; AVX1-NEXT: vpextrw $2, %xmm3, %ecx
-; AVX1-NEXT: vpextrw $1, %xmm3, %eax
+; AVX1-NEXT: vpextrw $7, %xmm3, %edx
+; AVX1-NEXT: vpextrw $6, %xmm3, %ecx
+; AVX1-NEXT: vpextrw $5, %xmm3, %eax
; AVX1-NEXT: decl %edx
; AVX1-NEXT: vmovd %edx, %xmm4
-; AVX1-NEXT: vpextrw $0, %xmm3, %edx
+; AVX1-NEXT: vpextrw $4, %xmm3, %edx
; AVX1-NEXT: decl %ecx
; AVX1-NEXT: vmovd %ecx, %xmm5
-; AVX1-NEXT: vpextrw $3, %xmm2, %ecx
+; AVX1-NEXT: vpextrw $1, %xmm3, %ecx
; AVX1-NEXT: decl %eax
; AVX1-NEXT: vmovd %eax, %xmm6
-; AVX1-NEXT: vpextrw $2, %xmm2, %eax
+; AVX1-NEXT: vpextrw $0, %xmm3, %eax
; AVX1-NEXT: decl %edx
; AVX1-NEXT: vmovd %edx, %xmm7
-; AVX1-NEXT: vpextrw $1, %xmm2, %edx
-; AVX1-NEXT: decl %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm8
-; AVX1-NEXT: vpextrw $0, %xmm2, %ecx
-; AVX1-NEXT: decl %eax
-; AVX1-NEXT: vmovd %eax, %xmm9
-; AVX1-NEXT: vpextrw $7, %xmm3, %eax
+; AVX1-NEXT: vpextrw $3, %xmm3, %edx
+; AVX1-NEXT: decq %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm8
+; AVX1-NEXT: vpextrw $2, %xmm3, %ecx
+; AVX1-NEXT: decq %rax
+; AVX1-NEXT: vmovq %rax, %xmm3
+; AVX1-NEXT: vpextrw $7, %xmm2, %eax
; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm10
-; AVX1-NEXT: vpextrw $6, %xmm3, %edx
+; AVX1-NEXT: vmovd %edx, %xmm9
+; AVX1-NEXT: vpextrw $6, %xmm2, %edx
; AVX1-NEXT: decl %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm11
-; AVX1-NEXT: vpextrw $7, %xmm2, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm10
+; AVX1-NEXT: vpextrw $5, %xmm2, %ecx
; AVX1-NEXT: decl %eax
-; AVX1-NEXT: vmovd %eax, %xmm12
-; AVX1-NEXT: vpextrw $6, %xmm2, %eax
+; AVX1-NEXT: vmovd %eax, %xmm11
+; AVX1-NEXT: vpextrw $4, %xmm2, %eax
; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm13
-; AVX1-NEXT: vpextrw $5, %xmm3, %edx
+; AVX1-NEXT: vmovd %edx, %xmm12
+; AVX1-NEXT: vpextrw $1, %xmm2, %edx
; AVX1-NEXT: decl %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm14
-; AVX1-NEXT: vpextrw $4, %xmm3, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm13
+; AVX1-NEXT: vpextrw $0, %xmm2, %ecx
; AVX1-NEXT: decl %eax
-; AVX1-NEXT: vmovd %eax, %xmm3
-; AVX1-NEXT: vpextrw $5, %xmm2, %eax
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm15
-; AVX1-NEXT: vpextrw $4, %xmm2, %edx
-; AVX1-NEXT: decl %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
+; AVX1-NEXT: vmovd %eax, %xmm14
+; AVX1-NEXT: vpextrw $3, %xmm2, %eax
+; AVX1-NEXT: decq %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm15
+; AVX1-NEXT: vpextrw $2, %xmm2, %edx
+; AVX1-NEXT: decq %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm2
; AVX1-NEXT: decl %eax
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX1-NEXT: vmovd %eax, %xmm5
; AVX1-NEXT: decl %edx
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX1-NEXT: vmovd %edx, %xmm7
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
-; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
-; AVX1-NEXT: vmovddup {{.*#+}} ymm2 = ymm2[0,0,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm2, %ymm1
; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0
diff --git a/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll b/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll
index ab9fa22..24d3030 100644
--- a/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll
+++ b/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll
@@ -48,6 +48,6 @@ return: ; preds = %catch, %entry
; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16
; CHECK-NEXT: .Llsda_begin0:
; CHECK-NEXT: .long .Ltmp0@IMGREL
-; CHECK-NEXT: .long .Ltmp1@IMGREL+1
+; CHECK-NEXT: .long .Ltmp1@IMGREL
; CHECK-NEXT: .long 1
; CHECK-NEXT: .long .LBB0_[[catch]]@IMGREL
diff --git a/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll
index c4c194e..7855ff2 100644
--- a/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll
+++ b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll
@@ -121,7 +121,6 @@ define void @f_non_leaf(i32 %x, i32 %y) !prof !14 {
; WIN64-NEXT: # encoding: [0xeb,A]
; WIN64-NEXT: # fixup A - offset: 1, value: foo, kind: FK_PCRel_1
; WIN64-NEXT: .LBB1_2: # %bb2
-; WIN64-NEXT: nop # encoding: [0x90]
; WIN64-NEXT: .seh_startepilogue
; WIN64-NEXT: popq %rbx # encoding: [0x5b]
; WIN64-NEXT: .seh_endepilogue
diff --git a/llvm/test/CodeGen/X86/conditional-tailcall.ll b/llvm/test/CodeGen/X86/conditional-tailcall.ll
index 9c1d830..2859a87 100644
--- a/llvm/test/CodeGen/X86/conditional-tailcall.ll
+++ b/llvm/test/CodeGen/X86/conditional-tailcall.ll
@@ -121,7 +121,6 @@ define void @f_non_leaf(i32 %x, i32 %y) optsize {
; WIN64-NEXT: # encoding: [0xeb,A]
; WIN64-NEXT: # fixup A - offset: 1, value: foo, kind: FK_PCRel_1
; WIN64-NEXT: .LBB1_2: # %bb2
-; WIN64-NEXT: nop # encoding: [0x90]
; WIN64-NEXT: .seh_startepilogue
; WIN64-NEXT: popq %rbx # encoding: [0x5b]
; WIN64-NEXT: .seh_endepilogue
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 661e7bb..455b72d 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -172,10 +172,9 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 52(%ebp), %ecx
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: movl 52(%ebp), %esi
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: sarl $31, %edx
-; X86-NEXT: movl %ecx, %esi
; X86-NEXT: xorl %edx, %esi
; X86-NEXT: movl 48(%ebp), %ecx
; X86-NEXT: xorl %edx, %ecx
@@ -204,45 +203,45 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: sete %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: bsrl %eax, %edx
+; X86-NEXT: bsrl %esi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: addl $32, %edx
-; X86-NEXT: bsrl %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: bsrl %eax, %ecx
; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
; X86-NEXT: testl %esi, %esi
-; X86-NEXT: cmovel %edx, %ecx
+; X86-NEXT: cmovnel %edx, %ecx
; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: bsrl %edi, %edi
; X86-NEXT: xorl $31, %edi
-; X86-NEXT: addl $32, %edi
+; X86-NEXT: orl $32, %edi
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %edi
-; X86-NEXT: addl $64, %edi
+; X86-NEXT: orl $64, %edi
; X86-NEXT: movl %eax, %edx
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %esi, %edx
; X86-NEXT: cmovnel %ecx, %edi
-; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: xorl $31, %edx
-; X86-NEXT: addl $32, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: bsrl %eax, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
; X86-NEXT: testl %eax, %eax
-; X86-NEXT: cmovel %edx, %ecx
+; X86-NEXT: cmovnel %edx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: bsrl %ebx, %esi
; X86-NEXT: xorl $31, %esi
; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: addl $32, %edx
+; X86-NEXT: orl $32, %edx
; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %esi, %edx
-; X86-NEXT: addl $64, %edx
+; X86-NEXT: orl $64, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: orl %eax, %esi
; X86-NEXT: cmovnel %ecx, %edx
@@ -380,9 +379,9 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl $-1, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 370e1c6..859e924 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -173,17 +173,17 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: xorl $31, %edx
; X86-NEXT: bsrl 48(%ebp), %ecx
; X86-NEXT: xorl $31, %ecx
-; X86-NEXT: addl $32, %ecx
+; X86-NEXT: orl $32, %ecx
; X86-NEXT: testl %esi, %esi
; X86-NEXT: cmovnel %edx, %ecx
; X86-NEXT: bsrl %edi, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: bsrl %ebx, %eax
; X86-NEXT: xorl $31, %eax
-; X86-NEXT: addl $32, %eax
+; X86-NEXT: orl $32, %eax
; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %edx, %eax
-; X86-NEXT: addl $64, %eax
+; X86-NEXT: orl $64, %eax
; X86-NEXT: movl 48(%ebp), %edx
; X86-NEXT: orl %esi, %edx
; X86-NEXT: cmovnel %ecx, %eax
@@ -193,7 +193,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl 32(%ebp), %ecx
; X86-NEXT: bsrl %ecx, %ecx
; X86-NEXT: xorl $31, %ecx
-; X86-NEXT: addl $32, %ecx
+; X86-NEXT: orl $32, %ecx
; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %ecx
; X86-NEXT: movl 28(%ebp), %edi
@@ -201,10 +201,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: xorl $31, %esi
; X86-NEXT: bsrl 24(%ebp), %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: addl $32, %edx
+; X86-NEXT: orl $32, %edx
; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %esi, %edx
-; X86-NEXT: addl $64, %edx
+; X86-NEXT: orl $64, %edx
; X86-NEXT: movl 32(%ebp), %esi
; X86-NEXT: orl %ebx, %esi
; X86-NEXT: cmovnel %ecx, %edx
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index 0f66d42..953a5e7 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -171,15 +171,15 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: vmovdqa (%ecx), %xmm0
-; X86-NEXT: vpand (%edx), %xmm0, %xmm0
+; X86-NEXT: vmovdqa (%edx), %xmm0
+; X86-NEXT: vpand (%ecx), %xmm0, %xmm0
; X86-NEXT: vpextrb $6, %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: freeze_extractelement:
; X64: # %bb.0:
-; X64-NEXT: vmovdqa (%rsi), %xmm0
-; X64-NEXT: vpand (%rdi), %xmm0, %xmm0
+; X64-NEXT: vmovdqa (%rdi), %xmm0
+; X64-NEXT: vpand (%rsi), %xmm0, %xmm0
; X64-NEXT: vpextrb $6, %xmm0, (%rdx)
; X64-NEXT: retq
%i0 = load <16 x i8>, ptr %origin0
@@ -198,8 +198,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst,
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: vmovdqa (%edx), %xmm0
-; X86-NEXT: vpand (%esi), %xmm0, %xmm0
+; X86-NEXT: vmovdqa (%esi), %xmm0
+; X86-NEXT: vpand (%edx), %xmm0, %xmm0
; X86-NEXT: vmovdqa %xmm0, (%ecx)
; X86-NEXT: vpextrb $6, %xmm0, (%eax)
; X86-NEXT: popl %esi
@@ -207,8 +207,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst,
;
; X64-LABEL: freeze_extractelement_escape:
; X64: # %bb.0:
-; X64-NEXT: vmovdqa (%rsi), %xmm0
-; X64-NEXT: vpand (%rdi), %xmm0, %xmm0
+; X64-NEXT: vmovdqa (%rdi), %xmm0
+; X64-NEXT: vpand (%rsi), %xmm0, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rcx)
; X64-NEXT: vpextrb $6, %xmm0, (%rdx)
; X64-NEXT: retq
@@ -239,8 +239,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id
; X86-NEXT: movl 32(%ebp), %edx
; X86-NEXT: movl 12(%ebp), %esi
; X86-NEXT: movl 8(%ebp), %edi
-; X86-NEXT: vmovaps (%esi), %xmm0
-; X86-NEXT: vandps (%edi), %xmm0, %xmm0
+; X86-NEXT: vmovaps (%edi), %xmm0
+; X86-NEXT: vandps (%esi), %xmm0, %xmm0
; X86-NEXT: vmovaps %xmm0, (%esp)
; X86-NEXT: movzbl (%esp,%ecx), %ecx
; X86-NEXT: cmpb (%esp,%eax), %cl
@@ -255,8 +255,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id
; X64: # %bb.0:
; X64-NEXT: andl $15, %ecx
; X64-NEXT: andl $15, %edx
-; X64-NEXT: vmovaps (%rsi), %xmm0
-; X64-NEXT: vandps (%rdi), %xmm0, %xmm0
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vandps (%rsi), %xmm0, %xmm0
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -24(%rsp,%rdx), %eax
; X64-NEXT: cmpb -24(%rsp,%rcx), %al
diff --git a/llvm/test/CodeGen/X86/noreturn-call-win64.ll b/llvm/test/CodeGen/X86/noreturn-call-win64.ll
index 57aa022..13be1f13 100644
--- a/llvm/test/CodeGen/X86/noreturn-call-win64.ll
+++ b/llvm/test/CodeGen/X86/noreturn-call-win64.ll
@@ -111,3 +111,15 @@ declare dso_local void @"??1MakeCleanup@@QEAA@XZ"(ptr)
; CHECK: # %unreachable
; CHECK: int3
; CHECK: .seh_handlerdata
+
+
+define dso_local void @last_call_no_return() {
+ call void @abort1()
+ unreachable
+}
+
+; CHECK-LABEL: last_call_no_return:
+; CHECK: callq abort1
+; CHECK-NEXT: int3
+; CHECK-NEXT: .seh_endproc
+
diff --git a/llvm/test/CodeGen/X86/pr149841.ll b/llvm/test/CodeGen/X86/pr149841.ll
new file mode 100644
index 0000000..c17a617
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr149841.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.bar = type { [5 x ptr] }
+
+@global = external dso_local global %struct.bar
+
+define i1 @foo(ptr %arg, i1 %arg1) {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: cmpq $global+1, %rdi
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: andb %sil, %al
+; CHECK-NEXT: retq
+bb:
+ #dbg_value(ptr @global, !3, !DIExpression(), !5)
+ %icmp = icmp ne ptr %arg, getelementptr inbounds nuw (i8, ptr @global, i64 1)
+ %select = select i1 %arg1, i1 %icmp, i1 false
+ ret i1 %select
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
+!1 = !DIFile(filename: "x.c", directory: "/proc/self/cwd")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !DILocalVariable(name: "x", arg: 1, scope: !4, file: !1)
+!4 = distinct !DISubprogram(name: "x", scope: null, file: !1, spFlags: DISPFlagDefinition, unit: !0)
+!5 = !DILocation(line: 0, scope: !4)
+
diff --git a/llvm/test/CodeGen/X86/seh-catch-all.ll b/llvm/test/CodeGen/X86/seh-catch-all.ll
index 5250bb9..4e25aab 100644
--- a/llvm/test/CodeGen/X86/seh-catch-all.ll
+++ b/llvm/test/CodeGen/X86/seh-catch-all.ll
@@ -40,7 +40,7 @@ catchall:
; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16
; CHECK-NEXT: .Llsda_begin0:
; CHECK-NEXT: .long .Ltmp{{[0-9]+}}@IMGREL
-; CHECK-NEXT: .long .Ltmp{{[0-9]+}}@IMGREL+1
+; CHECK-NEXT: .long .Ltmp{{[0-9]+}}@IMGREL
; CHECK-NEXT: .long 1
; CHECK-NEXT: .long .LBB0_2@IMGREL
; CHECK-NEXT: .Llsda_end0:
diff --git a/llvm/test/CodeGen/X86/seh-catchpad.ll b/llvm/test/CodeGen/X86/seh-catchpad.ll
index d958580..cb85f39 100644
--- a/llvm/test/CodeGen/X86/seh-catchpad.ll
+++ b/llvm/test/CodeGen/X86/seh-catchpad.ll
@@ -123,23 +123,23 @@ __except.ret: ; preds = %catch.dispatch.7
; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16
; CHECK-NEXT: .Llsda_begin0:
; CHECK-NEXT: .long .Ltmp0@IMGREL
-; CHECK-NEXT: .long .Ltmp1@IMGREL+1
+; CHECK-NEXT: .long .Ltmp1@IMGREL
; CHECK-NEXT: .long 1
; CHECK-NEXT: .long .LBB1_[[except1bb]]@IMGREL
; CHECK-NEXT: .long .Ltmp0@IMGREL
-; CHECK-NEXT: .long .Ltmp1@IMGREL+1
+; CHECK-NEXT: .long .Ltmp1@IMGREL
; CHECK-NEXT: .long "?filt$0@0@main@@"@IMGREL
; CHECK-NEXT: .long .LBB1_[[except2bb]]@IMGREL
; CHECK-NEXT: .long .Ltmp2@IMGREL
-; CHECK-NEXT: .long .Ltmp3@IMGREL+1
+; CHECK-NEXT: .long .Ltmp3@IMGREL
; CHECK-NEXT: .long "?dtor$[[finbb:[0-9]+]]@?0?main@4HA"@IMGREL
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long .Ltmp2@IMGREL
-; CHECK-NEXT: .long .Ltmp3@IMGREL+1
+; CHECK-NEXT: .long .Ltmp3@IMGREL
; CHECK-NEXT: .long "?filt$0@0@main@@"@IMGREL
; CHECK-NEXT: .long .LBB1_3@IMGREL
; CHECK-NEXT: .long .Ltmp6@IMGREL
-; CHECK-NEXT: .long .Ltmp7@IMGREL+1
+; CHECK-NEXT: .long .Ltmp7@IMGREL
; CHECK-NEXT: .long "?filt$0@0@main@@"@IMGREL
; CHECK-NEXT: .long .LBB1_3@IMGREL
; CHECK-NEXT: .Llsda_end0:
diff --git a/llvm/test/CodeGen/X86/seh-except-finally.ll b/llvm/test/CodeGen/X86/seh-except-finally.ll
index 7f70655..539d776 100644
--- a/llvm/test/CodeGen/X86/seh-except-finally.ll
+++ b/llvm/test/CodeGen/X86/seh-except-finally.ll
@@ -83,15 +83,15 @@ __try.cont: ; preds = %__except, %invoke.c
; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16
; CHECK-NEXT: .Llsda_begin0:
; CHECK-NEXT: .long .Ltmp0@IMGREL
-; CHECK-NEXT: .long .Ltmp1@IMGREL+1
+; CHECK-NEXT: .long .Ltmp1@IMGREL
; CHECK-NEXT: .long "?dtor$2@?0?use_both@4HA"@IMGREL
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long .Ltmp0@IMGREL
-; CHECK-NEXT: .long .Ltmp1@IMGREL+1
+; CHECK-NEXT: .long .Ltmp1@IMGREL
; CHECK-NEXT: .long "?filt$0@0@use_both@@"@IMGREL
; CHECK-NEXT: .long .LBB0_{{[0-9]+}}@IMGREL
; CHECK-NEXT: .long .Ltmp4@IMGREL
-; CHECK-NEXT: .long .Ltmp5@IMGREL+1
+; CHECK-NEXT: .long .Ltmp5@IMGREL
; CHECK-NEXT: .long "?filt$0@0@use_both@@"@IMGREL
; CHECK-NEXT: .long .LBB0_{{[0-9]+}}@IMGREL
; CHECK-NEXT: .Llsda_end0:
diff --git a/llvm/test/CodeGen/X86/seh-finally.ll b/llvm/test/CodeGen/X86/seh-finally.ll
index 41823df..6093e5e 100644
--- a/llvm/test/CodeGen/X86/seh-finally.ll
+++ b/llvm/test/CodeGen/X86/seh-finally.ll
@@ -30,7 +30,7 @@ lpad: ; preds = %entry
; X64-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 # Number of call sites
; X64-NEXT: .Llsda_begin0:
; X64-NEXT: .long .Ltmp0@IMGREL # LabelStart
-; X64-NEXT: .long .Ltmp1@IMGREL+1 # LabelEnd
+; X64-NEXT: .long .Ltmp1@IMGREL # LabelEnd
; X64-NEXT: .long "?dtor$2@?0?main@4HA"@IMGREL # FinallyFunclet
; X64-NEXT: .long 0 # Null
; X64-NEXT: .Llsda_end0:
diff --git a/llvm/test/CodeGen/X86/seh-safe-div.ll b/llvm/test/CodeGen/X86/seh-safe-div.ll
index 542d9f6..20169f8 100644
--- a/llvm/test/CodeGen/X86/seh-safe-div.ll
+++ b/llvm/test/CodeGen/X86/seh-safe-div.ll
@@ -60,6 +60,7 @@ __try.cont:
; CHECK: .Ltmp0:
; CHECK: leaq [[rloc:.*\(%rbp\)]], %rcx
; CHECK: callq try_body
+; CHECK: nop
; CHECK-NEXT: .Ltmp1
; CHECK: [[cont_bb:\.LBB0_[0-9]+]]:
; CHECK: movl [[rloc]], %eax
@@ -82,11 +83,11 @@ __try.cont:
; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16
; CHECK-NEXT: .Llsda_begin0:
; CHECK-NEXT: .long .Ltmp0@IMGREL
-; CHECK-NEXT: .long .Ltmp1@IMGREL+1
+; CHECK-NEXT: .long .Ltmp1@IMGREL
; CHECK-NEXT: .long safe_div_filt0@IMGREL
; CHECK-NEXT: .long [[handler0]]@IMGREL
; CHECK-NEXT: .long .Ltmp0@IMGREL
-; CHECK-NEXT: .long .Ltmp1@IMGREL+1
+; CHECK-NEXT: .long .Ltmp1@IMGREL
; CHECK-NEXT: .long safe_div_filt1@IMGREL
; CHECK-NEXT: .long [[handler1]]@IMGREL
; CHECK-NEXT: .Llsda_end0:
diff --git a/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll b/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll
index 2c576df..5a6aeb6 100644
--- a/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll
+++ b/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll
@@ -56,8 +56,8 @@ declare dso_local void @printf(ptr, ...)
; CHECK-NEXT:$ip2state$test:
; CHECK-NEXT: .long .Lfunc_begin0@IMGREL # IP
; CHECK-NEXT: .long -1 # ToState
-; CHECK-NEXT: .long .Ltmp0@IMGREL+1 # IP
+; CHECK-NEXT: .long .Ltmp0@IMGREL # IP
; CHECK-NEXT: .long 0 # ToState
-; CHECK-NEXT: .long .Ltmp1@IMGREL+1 # IP
+; CHECK-NEXT: .long .Ltmp1@IMGREL # IP
; CHECK-NEXT: .long -1 # ToState
diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index d2b292f..2ac2be5 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -119,8 +119,8 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-AVX2-NEXT: .LBB0_2: # %vector.body
; CHECK-AVX2-NEXT: # Parent Loop BB0_1 Depth=1
; CHECK-AVX2-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %ymm5
-; CHECK-AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %xmm5
+; CHECK-AVX2-NEXT: vmovdqu 1040(%rdx,%rsi), %xmm6
; CHECK-AVX2-NEXT: vpextrq $1, %xmm5, %rdi
; CHECK-AVX2-NEXT: vpextrq $1, %xmm6, %r8
; CHECK-AVX2-NEXT: vmovq %xmm5, %r9
diff --git a/llvm/test/CodeGen/X86/stack-coloring-wineh.ll b/llvm/test/CodeGen/X86/stack-coloring-wineh.ll
index e2de2ff..74fe07e 100644
--- a/llvm/test/CodeGen/X86/stack-coloring-wineh.ll
+++ b/llvm/test/CodeGen/X86/stack-coloring-wineh.ll
@@ -84,12 +84,12 @@ define void @pr66984(ptr %arg) personality ptr @__CxxFrameHandler3 {
; X86_64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X86_64-NEXT: .Ltmp0:
; X86_64-NEXT: callq throw
+; X86_64-NEXT: nop
; X86_64-NEXT: .Ltmp1:
; X86_64-NEXT: # %bb.1: # %bb14
; X86_64-NEXT: .LBB0_3: # Block address taken
; X86_64-NEXT: # %exit
; X86_64-NEXT: $ehgcr_0_3:
-; X86_64-NEXT: nop
; X86_64-NEXT: .seh_startepilogue
; X86_64-NEXT: addq $64, %rsp
; X86_64-NEXT: popq %rbp
diff --git a/llvm/test/CodeGen/X86/taildup-heapallocsite.ll b/llvm/test/CodeGen/X86/taildup-heapallocsite.ll
index 967e125..f3bef47 100644
--- a/llvm/test/CodeGen/X86/taildup-heapallocsite.ll
+++ b/llvm/test/CodeGen/X86/taildup-heapallocsite.ll
@@ -37,9 +37,11 @@ cond.end: ; preds = %entry, %cond.true
; CHECK: testq
; CHECK: je
; CHECK: callq alloc
+; CHECK-NEXT: nop
; CHECK-NEXT: [[L1:.Ltmp[0-9]+]]
; CHECK: jmp f2 # TAILCALL
; CHECK: callq alloc
+; CHECK-NEXT: nop
; CHECK-NEXT: [[L3:.Ltmp[0-9]+]]
; CHECK: jmp f2 # TAILCALL
diff --git a/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll b/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll
index bfb9c43..0bf8370 100644
--- a/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll
+++ b/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll
@@ -103,15 +103,15 @@ handler2:
; X64: $ip2state$try_in_catch:
; X64-NEXT: .long .Lfunc_begin0@IMGREL
; X64-NEXT: .long -1
-; X64-NEXT: .long .Ltmp0@IMGREL+1
+; X64-NEXT: .long .Ltmp0@IMGREL
; X64-NEXT: .long 0
-; X64-NEXT: .long .Ltmp1@IMGREL+1
+; X64-NEXT: .long .Ltmp1@IMGREL
; X64-NEXT: .long -1
; X64-NEXT: .long "?catch$2@?0?try_in_catch@4HA"@IMGREL
; X64-NEXT: .long 1
-; X64-NEXT: .long .Ltmp2@IMGREL+1
+; X64-NEXT: .long .Ltmp2@IMGREL
; X64-NEXT: .long 2
-; X64-NEXT: .long .Ltmp3@IMGREL+1
+; X64-NEXT: .long .Ltmp3@IMGREL
; X64-NEXT: .long 1
; X64-NEXT: .long "?catch$4@?0?try_in_catch@4HA"@IMGREL
; X64-NEXT: .long 3
diff --git a/llvm/test/CodeGen/X86/win-catchpad.ll b/llvm/test/CodeGen/X86/win-catchpad.ll
index 2491946..62ea510 100644
--- a/llvm/test/CodeGen/X86/win-catchpad.ll
+++ b/llvm/test/CodeGen/X86/win-catchpad.ll
@@ -214,9 +214,9 @@ try.cont:
; X64: $ip2state$try_catch_catch:
; X64-NEXT: .long .Lfunc_begin0@IMGREL
; X64-NEXT: .long -1
-; X64-NEXT: .long .Ltmp0@IMGREL+1
+; X64-NEXT: .long .Ltmp0@IMGREL
; X64-NEXT: .long 0
-; X64-NEXT: .long .Ltmp1@IMGREL+1
+; X64-NEXT: .long .Ltmp1@IMGREL
; X64-NEXT: .long -1
; X64-NEXT: .long "?catch$[[catch1bb]]@?0?try_catch_catch@4HA"@IMGREL
; X64-NEXT: .long 1
@@ -357,9 +357,9 @@ try.cont:
; X64-LABEL: $ip2state$branch_to_normal_dest:
; X64-NEXT: .long .Lfunc_begin1@IMGREL
; X64-NEXT: .long -1
-; X64-NEXT: .long .Ltmp[[before_call]]@IMGREL+1
+; X64-NEXT: .long .Ltmp[[before_call]]@IMGREL
; X64-NEXT: .long 0
-; X64-NEXT: .long .Ltmp[[after_call]]@IMGREL+1
+; X64-NEXT: .long .Ltmp[[after_call]]@IMGREL
; X64-NEXT: .long -1
; X64-NEXT: .long "?catch$[[catchbb]]@?0?branch_to_normal_dest@4HA"@IMGREL
; X64-NEXT: .long 1
diff --git a/llvm/test/CodeGen/X86/win-cleanuppad.ll b/llvm/test/CodeGen/X86/win-cleanuppad.ll
index e3f7f5b..e9265a1 100644
--- a/llvm/test/CodeGen/X86/win-cleanuppad.ll
+++ b/llvm/test/CodeGen/X86/win-cleanuppad.ll
@@ -191,7 +191,7 @@ cleanup.outer: ; preds = %invoke.cont.1, %c
; X64-NEXT: .long 1
; X64-NEXT: .long .Ltmp6@IMGREL
; X64-NEXT: .long 0
-; X64-NEXT: .long .Ltmp7@IMGREL+1
+; X64-NEXT: .long .Ltmp7@IMGREL
; X64-NEXT: .long -1
attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/win32-eh-states.ll b/llvm/test/CodeGen/X86/win32-eh-states.ll
index 42ae5b0..e645199 100644
--- a/llvm/test/CodeGen/X86/win32-eh-states.ll
+++ b/llvm/test/CodeGen/X86/win32-eh-states.ll
@@ -86,11 +86,11 @@ catch.7:
; X64-LABEL: $ip2state$f:
; X64-NEXT: .long .Lfunc_begin0@IMGREL
; X64-NEXT: .long -1
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
; X64-NEXT: .long 0
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
; X64-NEXT: .long 1
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
; X64-NEXT: .long -1
; X64-NEXT: .long "?catch${{.*}}@?0?f@4HA"@IMGREL
; X64-NEXT: .long 2
@@ -189,15 +189,15 @@ unreachable: ; preds = %entry
; X64-LABEL: $ip2state$g:
; X64-NEXT: .long .Lfunc_begin1@IMGREL
; X64-NEXT: .long -1
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
; X64-NEXT: .long 1
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
; X64-NEXT: .long -1
; X64-NEXT: .long "?catch${{.*}}@?0?g@4HA"@IMGREL
; X64-NEXT: .long 2
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
; X64-NEXT: .long 3
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
; X64-NEXT: .long 2
diff --git a/llvm/test/CodeGen/X86/win64-seh-epilogue-statepoint.ll b/llvm/test/CodeGen/X86/win64-seh-epilogue-statepoint.ll
index bc5be7a..75f156f 100644
--- a/llvm/test/CodeGen/X86/win64-seh-epilogue-statepoint.ll
+++ b/llvm/test/CodeGen/X86/win64-seh-epilogue-statepoint.ll
@@ -8,8 +8,8 @@ define i32 @foobar() gc "statepoint-example" personality ptr @__gxx_personality_
; CHECK-NEXT: .seh_stackalloc 40
; CHECK-NEXT: .seh_endprologue
; CHECK-NEXT: callq bar
-; CHECK-NEXT: .Ltmp0:
; CHECK-NEXT: nop
+; CHECK-NEXT: .Ltmp0:
; CHECK-NEXT: .seh_startepilogue
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .seh_endepilogue
diff --git a/llvm/test/CodeGen/X86/wineh-coreclr.ll b/llvm/test/CodeGen/X86/wineh-coreclr.ll
index baf5eaa..a3d0fde 100644
--- a/llvm/test/CodeGen/X86/wineh-coreclr.ll
+++ b/llvm/test/CodeGen/X86/wineh-coreclr.ll
@@ -38,6 +38,7 @@ entry:
; CHECK: [[test1_before_f1:.+]]:
; CHECK-NEXT: movl $1, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test1_after_f1:.+]]:
invoke void @f(i32 1)
to label %inner_try unwind label %finally
@@ -46,6 +47,7 @@ inner_try:
; CHECK: [[test1_before_f2:.+]]:
; CHECK-NEXT: movl $2, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test1_after_f2:.+]]:
invoke void @f(i32 2)
to label %finally.clone unwind label %exn.dispatch
@@ -69,6 +71,7 @@ catch1:
; CHECK: [[test1_before_f3:.+]]:
; CHECK-NEXT: movl $3, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test1_after_f3:.+]]:
invoke void @f(i32 3) [ "funclet"(token %catch.pad1) ]
to label %catch1.ret unwind label %finally
@@ -92,6 +95,7 @@ catch2:
; CHECK: [[test1_before_f4:.+]]:
; CHECK-NEXT: movl $4, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test1_after_f4:.+]]:
invoke void @f(i32 4) [ "funclet"(token %catch.pad2) ]
to label %try_in_catch unwind label %finally
@@ -100,6 +104,7 @@ try_in_catch:
; CHECK: [[test1_before_f5:.+]]:
; CHECK-NEXT: movl $5, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test1_after_f5:.+]]:
invoke void @f(i32 5) [ "funclet"(token %catch.pad2) ]
to label %catch2.ret unwind label %fault
@@ -116,6 +121,7 @@ fault:
; CHECK: [[test1_before_f6:.+]]:
; CHECK-NEXT: movl $6, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test1_after_f6:.+]]:
invoke void @f(i32 6) [ "funclet"(token %fault.pad) ]
to label %fault.ret unwind label %finally
@@ -312,6 +318,7 @@ unreachable:
; CHECK: [[test2_before_f1:.+]]:
; CHECK-NEXT: movl $1, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test2_after_f1:.+]]:
; CHECK: .seh_proc [[test2_catch1:[^ ]+]]
; CHECK: .seh_proc [[test2_catch2:[^ ]+]]
@@ -320,6 +327,7 @@ unreachable:
; CHECK: [[test2_before_f2:.+]]:
; CHECK-NEXT: movl $2, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test2_after_f2:.+]]:
; CHECK: int3
; CHECK: [[test2_end:.*func_end.*]]:
@@ -448,6 +456,7 @@ entry:
; CHECK: [[test3_before_f1:.+]]:
; CHECK-NEXT: movl $1, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test3_after_f1:.+]]:
invoke void @f(i32 1)
to label %exit unwind label %fault1
@@ -474,6 +483,7 @@ fault4:
; CHECK: [[test3_before_f6:.+]]:
; CHECK-NEXT: movl $6, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test3_after_f6:.+]]:
invoke void @f(i32 6) ["funclet"(token %fault.pad4)]
to label %fault4.cont unwind label %exn.dispatch1
@@ -482,6 +492,7 @@ fault4.cont:
; CHECK: [[test3_before_f7:.+]]:
; CHECK-NEXT: movl $7, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test3_after_f7:.+]]:
invoke void @f(i32 7) ["funclet"(token %fault.pad4)]
to label %unreachable unwind label %fault5
@@ -512,6 +523,7 @@ unreachable:
; CHECK: [[test3_before_f4:.+]]:
; CHECK-NEXT: movl $4, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test3_after_f4:.+]]:
; CHECK: int3
; CHECK: .seh_proc [[test3_fault2:[^ ]+]]
@@ -520,6 +532,7 @@ unreachable:
; CHECK: [[test3_before_f3:.+]]:
; CHECK-NEXT: movl $3, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test3_after_f3:.+]]:
; CHECK: int3
; CHECK: .seh_proc [[test3_fault1:[^ ]+]]
@@ -528,6 +541,7 @@ unreachable:
; CHECK: [[test3_before_f2:.+]]:
; CHECK-NEXT: movl $2, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test3_after_f2:.+]]:
; CHECK: int3
; CHECK: [[test3_end:.*func_end.*]]:
diff --git a/llvm/test/CodeGen/XCore/exception.ll b/llvm/test/CodeGen/XCore/exception.ll
index f222297..bb5f3f4 100644
--- a/llvm/test/CodeGen/XCore/exception.ll
+++ b/llvm/test/CodeGen/XCore/exception.ll
@@ -60,7 +60,7 @@ entry:
; CHECK: [[PRE_G:.L[a-zA-Z0-9_]+]]
; CHECK: bl g
; CHECK: [[POST_G:.L[a-zA-Z0-9_]+]]
-; CHECK: [[RETURN:.L[a-zA-Z0-9_]+]]
+; CHECK: [[RETURN:^.L[a-zA-Z0-9_]+]]
; CHECK: ldw r6, sp[1]
; CHECK: ldw r5, sp[2]
; CHECK: ldw r4, sp[3]