Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-vhadd.ll  82
-rw-r--r--  llvm/test/CodeGen/AMDGPU/absdiff.ll  104
-rw-r--r--  llvm/test/CodeGen/AMDGPU/loop-vector-sink.ll  236
-rw-r--r--  llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir  10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/s_cmp_0.ll  25
-rw-r--r--  llvm/test/CodeGen/AMDGPU/spill-restore-partial-copy.mir  324
-rw-r--r--  llvm/test/CodeGen/AMDGPU/spill-to-agpr-partial.mir  12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vector-spill-restore-to-other-vector-type.mir  12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll  50
-rw-r--r--  llvm/test/CodeGen/PowerPC/p10-spill-crun.ll  2
-rw-r--r--  llvm/test/CodeGen/PowerPC/vector-reduce-add.ll  22
-rw-r--r--  llvm/test/CodeGen/SPIRV/branching/OpSwitch64.ll  6
-rw-r--r--  llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll  37
-rw-r--r--  llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll  3
-rw-r--r--  llvm/test/CodeGen/X86/AMX/amx-combine.ll  3
-rw-r--r--  llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll  3
-rw-r--r--  llvm/test/CodeGen/X86/AMX/amx-type.ll  3
-rw-r--r--  llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll  3
-rw-r--r--  llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll  3
-rw-r--r--  llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll  3
-rw-r--r--  llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll  3
-rw-r--r--  llvm/test/CodeGen/X86/trunc-srl-load.ll  1672
22 files changed, 2564 insertions, 54 deletions
diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
index 076cbf7..a505b42 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -1408,6 +1408,88 @@ define <4 x i16> @ext_via_i19(<4 x i16> %a) {
ret <4 x i16> %t6
}
+define <8 x i8> @srhadd_v8i8_trunc(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-LABEL: srhadd_v8i8_trunc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: srhadd.8b v0, v0, v1
+; CHECK-NEXT: ret
+ %s0s = sext <8 x i8> %s0 to <8 x i16>
+ %s1s = sext <8 x i8> %s1 to <8 x i16>
+ %s = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %s0s, <8 x i16> %s1s)
+ %s2 = trunc <8 x i16> %s to <8 x i8>
+ ret <8 x i8> %s2
+}
+
+define <4 x i16> @srhadd_v4i16_trunc(<4 x i16> %s0, <4 x i16> %s1) {
+; CHECK-LABEL: srhadd_v4i16_trunc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: srhadd.4h v0, v0, v1
+; CHECK-NEXT: ret
+ %s0s = sext <4 x i16> %s0 to <4 x i32>
+ %s1s = sext <4 x i16> %s1 to <4 x i32>
+ %s = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %s0s, <4 x i32> %s1s)
+ %s2 = trunc <4 x i32> %s to <4 x i16>
+ ret <4 x i16> %s2
+}
+
+define <2 x i32> @srhadd_v2i32_trunc(<2 x i32> %s0, <2 x i32> %s1) {
+; CHECK-LABEL: srhadd_v2i32_trunc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshll.2d v0, v0, #0
+; CHECK-NEXT: sshll.2d v1, v1, #0
+; CHECK-NEXT: eor.16b v2, v0, v1
+; CHECK-NEXT: orr.16b v0, v0, v1
+; CHECK-NEXT: ushr.2d v1, v2, #1
+; CHECK-NEXT: sub.2d v0, v0, v1
+; CHECK-NEXT: xtn.2s v0, v0
+; CHECK-NEXT: ret
+ %s0s = sext <2 x i32> %s0 to <2 x i64>
+ %s1s = sext <2 x i32> %s1 to <2 x i64>
+ %s = call <2 x i64> @llvm.aarch64.neon.urhadd.v2i64(<2 x i64> %s0s, <2 x i64> %s1s)
+ %s2 = trunc <2 x i64> %s to <2 x i32>
+ ret <2 x i32> %s2
+}
+
+define <8 x i8> @urhadd_v8i8_trunc(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-LABEL: urhadd_v8i8_trunc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: urhadd.8b v0, v0, v1
+; CHECK-NEXT: ret
+ %s0s = zext <8 x i8> %s0 to <8 x i16>
+ %s1s = zext <8 x i8> %s1 to <8 x i16>
+ %s = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %s0s, <8 x i16> %s1s)
+ %s2 = trunc <8 x i16> %s to <8 x i8>
+ ret <8 x i8> %s2
+}
+
+define <4 x i16> @urhadd_v4i16_trunc(<4 x i16> %s0, <4 x i16> %s1) {
+; CHECK-LABEL: urhadd_v4i16_trunc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: urhadd.4h v0, v0, v1
+; CHECK-NEXT: ret
+ %s0s = zext <4 x i16> %s0 to <4 x i32>
+ %s1s = zext <4 x i16> %s1 to <4 x i32>
+ %s = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %s0s, <4 x i32> %s1s)
+ %s2 = trunc <4 x i32> %s to <4 x i16>
+ ret <4 x i16> %s2
+}
+
+define <2 x i32> @urhadd_v2i32_trunc(<2 x i32> %s0, <2 x i32> %s1) {
+; CHECK-LABEL: urhadd_v2i32_trunc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: uaddl.2d v0, v0, v1
+; CHECK-NEXT: dup.2d v1, x8
+; CHECK-NEXT: add.2d v0, v0, v1
+; CHECK-NEXT: shrn.2s v0, v0, #1
+; CHECK-NEXT: ret
+ %s0s = zext <2 x i32> %s0 to <2 x i64>
+ %s1s = zext <2 x i32> %s1 to <2 x i64>
+ %s = call <2 x i64> @llvm.aarch64.neon.srhadd.v2i64(<2 x i64> %s0s, <2 x i64> %s1s)
+ %s2 = trunc <2 x i64> %s to <2 x i32>
+ ret <2 x i32> %s2
+}
+
declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>)
declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>)
declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>)
diff --git a/llvm/test/CodeGen/AMDGPU/absdiff.ll b/llvm/test/CodeGen/AMDGPU/absdiff.ll
new file mode 100644
index 0000000..9cb397f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/absdiff.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s
+
+define amdgpu_ps i16 @absdiff_i16_false(i16 inreg %arg0, i16 inreg %arg1) {
+; CHECK-LABEL: absdiff_i16_false:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_sub_i32 s0, s0, s1
+; CHECK-NEXT: s_sext_i32_i16 s1, s0
+; CHECK-NEXT: s_sub_i32 s0, 0, s0
+; CHECK-NEXT: s_sext_i32_i16 s0, s0
+; CHECK-NEXT: s_max_i32 s0, s1, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %diff = sub i16 %arg0, %arg1
+ %res = call i16 @llvm.abs.i16(i16 %diff, i1 false) ; INT_MIN input returns INT_MIN
+ ret i16 %res
+}
+
+define amdgpu_ps i16 @absdiff_i16_true(i16 inreg %arg0, i16 inreg %arg1) {
+; CHECK-LABEL: absdiff_i16_true:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_sub_i32 s0, s0, s1
+; CHECK-NEXT: s_sext_i32_i16 s1, s0
+; CHECK-NEXT: s_sub_i32 s0, 0, s0
+; CHECK-NEXT: s_sext_i32_i16 s0, s0
+; CHECK-NEXT: s_max_i32 s0, s1, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %diff = sub i16 %arg0, %arg1
+ %res = call i16 @llvm.abs.i16(i16 %diff, i1 true) ; INT_MIN input returns poison
+ ret i16 %res
+}
+
+define amdgpu_ps i32 @absdiff_i32_false(i32 inreg %arg0, i32 inreg %arg1) {
+; CHECK-LABEL: absdiff_i32_false:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_absdiff_i32 s0, s0, s1
+; CHECK-NEXT: ; return to shader part epilog
+ %diff = sub i32 %arg0, %arg1
+ %res = call i32 @llvm.abs.i32(i32 %diff, i1 false) ; INT_MIN input returns INT_MIN
+ ret i32 %res
+}
+
+define amdgpu_ps i32 @absdiff_i32_true(i32 inreg %arg0, i32 inreg %arg1) {
+; CHECK-LABEL: absdiff_i32_true:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_absdiff_i32 s0, s0, s1
+; CHECK-NEXT: ; return to shader part epilog
+ %diff = sub i32 %arg0, %arg1
+ %res = call i32 @llvm.abs.i32(i32 %diff, i1 true) ; INT_MIN input returns poison
+ ret i32 %res
+}
+
+; Multiple uses of %diff. No benefit in using s_absdiff_i32: the difference must still be computed separately for its other use.
+define amdgpu_ps i32 @absdiff_i32_false_multi_use(i32 inreg %arg0, i32 inreg %arg1) {
+; CHECK-LABEL: absdiff_i32_false_multi_use:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_sub_i32 s1, s0, s1
+; CHECK-NEXT: s_abs_i32 s0, s1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; return to shader part epilog
+ %diff = sub i32 %arg0, %arg1
+ %res = call i32 @llvm.abs.i32(i32 %diff, i1 false) ; INT_MIN input returns INT_MIN
+ call void asm "; use $0", "s"(i32 %diff)
+ ret i32 %res
+}
+
+define <2 x i32> @absdiff_2xi32_false(<2 x i32> %arg0, <2 x i32> %arg1) {
+; CHECK-LABEL: absdiff_2xi32_false:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_sub_u32_e32 v0, v0, v2
+; CHECK-NEXT: v_sub_u32_e32 v1, v1, v3
+; CHECK-NEXT: v_sub_u32_e32 v2, 0, v0
+; CHECK-NEXT: v_max_i32_e32 v0, v2, v0
+; CHECK-NEXT: v_sub_u32_e32 v2, 0, v1
+; CHECK-NEXT: v_max_i32_e32 v1, v2, v1
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %diff = sub <2 x i32> %arg0, %arg1
+ %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %diff, i1 false) ; INT_MIN input returns INT_MIN
+ ret <2 x i32> %res
+}
+
+define <4 x i32> @absdiff_4xi32_false(<4 x i32> %arg0, <4 x i32> %arg1) {
+; CHECK-LABEL: absdiff_4xi32_false:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_sub_u32_e32 v0, v0, v4
+; CHECK-NEXT: v_sub_u32_e32 v1, v1, v5
+; CHECK-NEXT: v_sub_u32_e32 v4, 0, v0
+; CHECK-NEXT: v_sub_u32_e32 v2, v2, v6
+; CHECK-NEXT: v_max_i32_e32 v0, v4, v0
+; CHECK-NEXT: v_sub_u32_e32 v4, 0, v1
+; CHECK-NEXT: v_sub_u32_e32 v3, v3, v7
+; CHECK-NEXT: v_max_i32_e32 v1, v4, v1
+; CHECK-NEXT: v_sub_u32_e32 v4, 0, v2
+; CHECK-NEXT: v_max_i32_e32 v2, v4, v2
+; CHECK-NEXT: v_sub_u32_e32 v4, 0, v3
+; CHECK-NEXT: v_max_i32_e32 v3, v4, v3
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %diff = sub <4 x i32> %arg0, %arg1
+ %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %diff, i1 false) ; INT_MIN input returns INT_MIN
+ ret <4 x i32> %res
+}
diff --git a/llvm/test/CodeGen/AMDGPU/loop-vector-sink.ll b/llvm/test/CodeGen/AMDGPU/loop-vector-sink.ll
new file mode 100644
index 0000000..670e2c5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/loop-vector-sink.ll
@@ -0,0 +1,236 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefix=OPT %s
+
+; testing insert case
+define amdgpu_kernel void @runningSum(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %inputElement1, i32 %inputIter) {
+; OPT-LABEL: define amdgpu_kernel void @runningSum(
+; OPT-SAME: ptr addrspace(1) [[OUT0:%.*]], ptr addrspace(1) [[OUT1:%.*]], i32 [[INPUTELEMENT1:%.*]], i32 [[INPUTITER:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-NEXT: [[PREHEADER:.*]]:
+; OPT-NEXT: [[VECELEMENT1:%.*]] = insertelement <2 x i32> poison, i32 [[INPUTELEMENT1]], i64 0
+; OPT-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[VECELEMENT1]], <2 x i32> poison, <2 x i32> zeroinitializer
+; OPT-NEXT: br label %[[LOOPBODY:.*]]
+; OPT: [[LOOPBODY]]:
+; OPT-NEXT: [[PREVIOUSSUM:%.*]] = phi <2 x i32> [ [[TMP1]], %[[PREHEADER]] ], [ [[RUNNINGSUM:%.*]], %[[LOOPBODY]] ]
+; OPT-NEXT: [[ITERCOUNT:%.*]] = phi i32 [ [[INPUTITER]], %[[PREHEADER]] ], [ [[ITERSLEFT:%.*]], %[[LOOPBODY]] ]
+; OPT-NEXT: [[RUNNINGSUM]] = add <2 x i32> [[TMP1]], [[PREVIOUSSUM]]
+; OPT-NEXT: [[ITERSLEFT]] = sub i32 [[ITERCOUNT]], 1
+; OPT-NEXT: [[COND:%.*]] = icmp eq i32 [[ITERSLEFT]], 0
+; OPT-NEXT: br i1 [[COND]], label %[[LOOPEXIT:.*]], label %[[LOOPBODY]]
+; OPT: [[LOOPEXIT]]:
+; OPT-NEXT: [[SUMELEMENT0:%.*]] = extractelement <2 x i32> [[RUNNINGSUM]], i64 0
+; OPT-NEXT: [[SUMELEMENT1:%.*]] = extractelement <2 x i32> [[RUNNINGSUM]], i64 1
+; OPT-NEXT: store i32 [[SUMELEMENT0]], ptr addrspace(1) [[OUT0]], align 4
+; OPT-NEXT: store i32 [[SUMELEMENT1]], ptr addrspace(1) [[OUT1]], align 4
+; OPT-NEXT: ret void
+;
+preheader:
+ %vecElement1 = insertelement <2 x i32> poison, i32 %inputElement1, i64 0
+ %broadcast1 = shufflevector <2 x i32> %vecElement1, <2 x i32> poison, <2 x i32> zeroinitializer
+ br label %loopBody
+
+loopBody:
+ %previousSum = phi <2 x i32> [ %broadcast1, %preheader ], [ %runningSum, %loopBody ]
+ %iterCount = phi i32 [ %inputIter, %preheader ], [ %itersLeft, %loopBody ]
+ %runningSum = add <2 x i32> %broadcast1, %previousSum
+ %itersLeft = sub i32 %iterCount, 1
+ %cond = icmp eq i32 %itersLeft, 0
+ br i1 %cond, label %loopExit, label %loopBody
+
+loopExit:
+ %sumElement0 = extractelement <2 x i32> %runningSum, i64 0
+ %sumElement1 = extractelement <2 x i32> %runningSum, i64 1
+ store i32 %sumElement0, ptr addrspace(1) %out0
+ store i32 %sumElement1, ptr addrspace(1) %out1
+ ret void
+}
+
+; testing extract case with single use - with divergent control flow
+; The vector has SINGLE use (extractelement), both sink into if.then
+define amdgpu_kernel void @test_sink_extract_single_use_operands(ptr addrspace(1) %out0, <2 x i32> %inputVec, i32 %tid, i32 %cond) {
+; OPT-LABEL: define amdgpu_kernel void @test_sink_extract_single_use_operands(
+; OPT-SAME: ptr addrspace(1) [[OUT0:%.*]], <2 x i32> [[INPUTVEC:%.*]], i32 [[TID:%.*]], i32 [[COND:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[ENTRY:.*:]]
+; OPT-NEXT: [[RUNNINGSUM:%.*]] = add <2 x i32> [[INPUTVEC]], splat (i32 1)
+; OPT-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[RUNNINGSUM]], i64 0
+; OPT-NEXT: [[CMP:%.*]] = icmp slt i32 [[TID]], [[COND]]
+; OPT-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; OPT: [[IF_THEN]]:
+; OPT-NEXT: [[RESULT:%.*]] = add i32 [[TMP0]], 100
+; OPT-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT0]], align 4
+; OPT-NEXT: br label %[[IF_END]]
+; OPT: [[IF_END]]:
+; OPT-NEXT: ret void
+;
+entry:
+ %runningSum = add <2 x i32> %inputVec, <i32 1, i32 1>
+ %sumElement0 = extractelement <2 x i32> %runningSum, i64 0
+ %cmp = icmp slt i32 %tid, %cond
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ %result = add i32 %sumElement0, 100
+ store i32 %result, ptr addrspace(1) %out0
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+; testing extract case - extracting two elements with divergent control flow
+; The vector has TWO uses (two extractelements), all sink into if.then
+define amdgpu_kernel void @test_sink_extract_operands(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %input_vec, i32 %tid, i32 %cond) {
+; OPT-LABEL: define amdgpu_kernel void @test_sink_extract_operands(
+; OPT-SAME: ptr addrspace(1) [[OUT0:%.*]], ptr addrspace(1) [[OUT1:%.*]], <4 x i32> [[INPUT_VEC:%.*]], i32 [[TID:%.*]], i32 [[COND:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[ENTRY:.*:]]
+; OPT-NEXT: [[VEC_FULL:%.*]] = add <4 x i32> [[INPUT_VEC]], <i32 42, i32 43, i32 44, i32 45>
+; OPT-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[VEC_FULL]], i64 0
+; OPT-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_FULL]], i64 1
+; OPT-NEXT: [[CMP:%.*]] = icmp slt i32 [[TID]], [[COND]]
+; OPT-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; OPT: [[IF_THEN]]:
+; OPT-NEXT: [[RESULT0:%.*]] = add i32 [[TMP0]], 100
+; OPT-NEXT: [[RESULT1:%.*]] = add i32 [[TMP1]], 200
+; OPT-NEXT: store i32 [[RESULT0]], ptr addrspace(1) [[OUT0]], align 4
+; OPT-NEXT: store i32 [[RESULT1]], ptr addrspace(1) [[OUT1]], align 4
+; OPT-NEXT: br label %[[IF_END]]
+; OPT: [[IF_END]]:
+; OPT-NEXT: ret void
+;
+entry:
+ %vec_full = add <4 x i32> %input_vec, <i32 42, i32 43, i32 44, i32 45>
+ %extract0 = extractelement <4 x i32> %vec_full, i64 0
+ %extract1 = extractelement <4 x i32> %vec_full, i64 1
+ %cmp = icmp slt i32 %tid, %cond
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ %result0 = add i32 %extract0, 100
+ %result1 = add i32 %extract1, 200
+ store i32 %result0, ptr addrspace(1) %out0
+ store i32 %result1, ptr addrspace(1) %out1
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+; testing shuffle case with divergent control flow - shuffles sink into if.then
+define amdgpu_kernel void @test_shuffle_insert_subvector(ptr addrspace(1) %ptr, <4 x i16> %vec1, <4 x i16> %vec2, i32 %tid, i32 %cond) {
+; OPT-LABEL: define amdgpu_kernel void @test_shuffle_insert_subvector(
+; OPT-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x i16> [[VEC1:%.*]], <4 x i16> [[VEC2:%.*]], i32 [[TID:%.*]], i32 [[COND:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[ENTRY:.*:]]
+; OPT-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[VEC1]], <4 x i16> [[VEC2]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; OPT-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i16> [[VEC1]], <4 x i16> [[VEC2]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+; OPT-NEXT: [[SHUFFLE3:%.*]] = shufflevector <4 x i16> [[VEC1]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; OPT-NEXT: [[SHUFFLE4:%.*]] = shufflevector <4 x i16> [[VEC2]], <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; OPT-NEXT: [[SHUFFLE5:%.*]] = shufflevector <4 x i16> [[SHUFFLE]], <4 x i16> [[SHUFFLE2]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; OPT-NEXT: [[CMP:%.*]] = icmp slt i32 [[TID]], [[COND]]
+; OPT-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; OPT: [[IF_THEN]]:
+; OPT-NEXT: [[RESULT_VEC:%.*]] = add <4 x i16> [[SHUFFLE5]], <i16 100, i16 200, i16 300, i16 400>
+; OPT-NEXT: [[OTHER_RESULT:%.*]] = mul <4 x i16> [[SHUFFLE3]], splat (i16 2)
+; OPT-NEXT: [[MORE_RESULT:%.*]] = sub <4 x i16> [[SHUFFLE4]], splat (i16 5)
+; OPT-NEXT: store <4 x i16> [[RESULT_VEC]], ptr addrspace(1) [[PTR]], align 8
+; OPT-NEXT: store <4 x i16> [[OTHER_RESULT]], ptr addrspace(1) [[PTR]], align 8
+; OPT-NEXT: store <4 x i16> [[MORE_RESULT]], ptr addrspace(1) [[PTR]], align 8
+; OPT-NEXT: br label %[[IF_END]]
+; OPT: [[IF_END]]:
+; OPT-NEXT: ret void
+;
+entry:
+ %shuffle = shufflevector <4 x i16> %vec1, <4 x i16> %vec2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %shuffle2 = shufflevector <4 x i16> %vec1, <4 x i16> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %shuffle3 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ %shuffle4 = shufflevector <4 x i16> %vec2, <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %shuffle5 = shufflevector <4 x i16> %shuffle, <4 x i16> %shuffle2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %cmp = icmp slt i32 %tid, %cond
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ %result_vec = add <4 x i16> %shuffle5, <i16 100, i16 200, i16 300, i16 400>
+ %other_result = mul <4 x i16> %shuffle3, <i16 2, i16 2, i16 2, i16 2>
+ %more_result = sub <4 x i16> %shuffle4, <i16 5, i16 5, i16 5, i16 5>
+ store <4 x i16> %result_vec, ptr addrspace(1) %ptr
+ store <4 x i16> %other_result, ptr addrspace(1) %ptr
+ store <4 x i16> %more_result, ptr addrspace(1) %ptr
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+; testing shuffle extract subvector with divergent control flow - shuffles sink into if.then
+define amdgpu_kernel void @test_shuffle_extract_subvector(ptr addrspace(1) %ptr, <4 x i16> %input_vec, i32 %tid, i32 %cond) {
+; OPT-LABEL: define amdgpu_kernel void @test_shuffle_extract_subvector(
+; OPT-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x i16> [[INPUT_VEC:%.*]], i32 [[TID:%.*]], i32 [[COND:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[ENTRY:.*:]]
+; OPT-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[INPUT_VEC]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; OPT-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i16> [[INPUT_VEC]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; OPT-NEXT: [[SHUFFLE3:%.*]] = shufflevector <4 x i16> [[INPUT_VEC]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; OPT-NEXT: [[CMP:%.*]] = icmp slt i32 [[TID]], [[COND]]
+; OPT-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; OPT: [[IF_THEN]]:
+; OPT-NEXT: [[RESULT_VEC:%.*]] = add <2 x i16> [[SHUFFLE]], <i16 100, i16 200>
+; OPT-NEXT: [[RESULT_VEC2:%.*]] = mul <2 x i16> [[SHUFFLE2]], splat (i16 3)
+; OPT-NEXT: [[RESULT_VEC3:%.*]] = sub <4 x i16> [[SHUFFLE3]], splat (i16 10)
+; OPT-NEXT: store <2 x i16> [[RESULT_VEC]], ptr addrspace(1) [[PTR]], align 4
+; OPT-NEXT: store <2 x i16> [[RESULT_VEC2]], ptr addrspace(1) [[PTR]], align 4
+; OPT-NEXT: store <4 x i16> [[RESULT_VEC3]], ptr addrspace(1) [[PTR]], align 8
+; OPT-NEXT: br label %[[IF_END]]
+; OPT: [[IF_END]]:
+; OPT-NEXT: ret void
+;
+entry:
+ %shuffle = shufflevector <4 x i16> %input_vec, <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+ %shuffle2 = shufflevector <4 x i16> %input_vec, <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+ %shuffle3 = shufflevector <4 x i16> %input_vec, <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ %cmp = icmp slt i32 %tid, %cond
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ %result_vec = add <2 x i16> %shuffle, <i16 100, i16 200>
+ %result_vec2 = mul <2 x i16> %shuffle2, <i16 3, i16 3>
+ %result_vec3 = sub <4 x i16> %shuffle3, <i16 10, i16 10, i16 10, i16 10>
+ store <2 x i16> %result_vec, ptr addrspace(1) %ptr
+ store <2 x i16> %result_vec2, ptr addrspace(1) %ptr
+ store <4 x i16> %result_vec3, ptr addrspace(1) %ptr
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+; testing shuffle sink with widening operations and divergent control flow
+define amdgpu_kernel void @test_shuffle_sink_operands(ptr addrspace(1) %ptr, <2 x i16> %input_vec, <2 x i16> %input_vec2, i32 %tid, i32 %cond) {
+; OPT-LABEL: define amdgpu_kernel void @test_shuffle_sink_operands(
+; OPT-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x i16> [[INPUT_VEC:%.*]], <2 x i16> [[INPUT_VEC2:%.*]], i32 [[TID:%.*]], i32 [[COND:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[ENTRY:.*:]]
+; OPT-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[INPUT_VEC]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; OPT-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x i16> [[INPUT_VEC2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; OPT-NEXT: [[CMP:%.*]] = icmp slt i32 [[TID]], [[COND]]
+; OPT-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; OPT: [[IF_THEN]]:
+; OPT-NEXT: [[RESULT_VEC:%.*]] = add <4 x i16> [[SHUFFLE]], <i16 100, i16 200, i16 300, i16 400>
+; OPT-NEXT: [[RESULT_VEC2:%.*]] = mul <4 x i16> [[SHUFFLE2]], splat (i16 5)
+; OPT-NEXT: store <4 x i16> [[RESULT_VEC]], ptr addrspace(1) [[PTR]], align 8
+; OPT-NEXT: store <4 x i16> [[RESULT_VEC2]], ptr addrspace(1) [[PTR]], align 8
+; OPT-NEXT: br label %[[IF_END]]
+; OPT: [[IF_END]]:
+; OPT-NEXT: ret void
+;
+entry:
+ %shuffle = shufflevector <2 x i16> %input_vec, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %shuffle2 = shufflevector <2 x i16> %input_vec2, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %cmp = icmp slt i32 %tid, %cond
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ %result_vec = add <4 x i16> %shuffle, <i16 100, i16 200, i16 300, i16 400>
+ %result_vec2 = mul <4 x i16> %shuffle2, <i16 5, i16 5, i16 5, i16 5>
+ store <4 x i16> %result_vec, ptr addrspace(1) %ptr
+ store <4 x i16> %result_vec2, ptr addrspace(1) %ptr
+ br label %if.end
+
+if.end:
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir b/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir
index 8eddc9a..c9208bf 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir
@@ -73,7 +73,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1
; FLATSCR-V2A-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; FLATSCR-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1
- ; FLATSCR-V2A-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+ ; FLATSCR-V2A-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; FLATSCR-V2A-NEXT: S_ENDPGM 0
$vgpr0_vgpr1 = IMPLICIT_DEF
SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
@@ -112,7 +112,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2
; FLATSCR-V2A-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2 :: (store (s64) into %stack.0, align 4, addrspace 5)
; FLATSCR-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2
- ; FLATSCR-V2A-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2 :: (load (s64) from %stack.0, align 4, addrspace 5)
+ ; FLATSCR-V2A-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2 :: (load (s64) from %stack.0, align 4, addrspace 5)
; FLATSCR-V2A-NEXT: S_ENDPGM 0
$vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF
SI_SPILL_V96_SAVE killed $vgpr0_vgpr1_vgpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store (s96) into %stack.0, align 4, addrspace 5)
@@ -157,7 +157,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
; FLATSCR-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; FLATSCR-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
- ; FLATSCR-V2A-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s32) from %stack.0, addrspace 5)
+ ; FLATSCR-V2A-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s32) from %stack.0, addrspace 5)
; FLATSCR-V2A-NEXT: S_ENDPGM 0
$vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -203,7 +203,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
; FLATSCR-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
; FLATSCR-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
- ; FLATSCR-V2A-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s64) from %stack.0, align 4, addrspace 5)
+ ; FLATSCR-V2A-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s64) from %stack.0, align 4, addrspace 5)
; FLATSCR-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
; FLATSCR-V2A-NEXT: S_ENDPGM 0
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF
@@ -255,7 +255,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
; FLATSCR-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
; FLATSCR-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
- ; FLATSCR-V2A-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s32) from %stack.0, addrspace 5)
+ ; FLATSCR-V2A-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s32) from %stack.0, addrspace 5)
; FLATSCR-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
; FLATSCR-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
; FLATSCR-V2A-NEXT: S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index dd5f838..0166d7a 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -110,6 +110,21 @@ define amdgpu_ps i32 @abs32(i32 inreg %val0) {
ret i32 %zext
}
+define amdgpu_ps i32 @absdiff32(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: absdiff32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_absdiff_i32 s0, s0, s1
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %diff = sub i32 %val0, %val1
+ %result = call i32 @llvm.abs.i32(i32 %diff, i1 false)
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: and32:
; CHECK: ; %bb.0:
@@ -608,14 +623,14 @@ define amdgpu_ps i32 @si_pc_add_rel_offset_must_not_optimize() {
; CHECK-NEXT: s_add_u32 s0, s0, __unnamed_1@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s1, s1, __unnamed_1@rel32@hi+12
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB35_2
+; CHECK-NEXT: s_cbranch_scc0 .LBB36_2
; CHECK-NEXT: ; %bb.1: ; %endif
; CHECK-NEXT: s_mov_b32 s0, 1
-; CHECK-NEXT: s_branch .LBB35_3
-; CHECK-NEXT: .LBB35_2: ; %if
+; CHECK-NEXT: s_branch .LBB36_3
+; CHECK-NEXT: .LBB36_2: ; %if
; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_branch .LBB35_3
-; CHECK-NEXT: .LBB35_3:
+; CHECK-NEXT: s_branch .LBB36_3
+; CHECK-NEXT: .LBB36_3:
%cmp = icmp ne ptr addrspace(4) @1, null
br i1 %cmp, label %endif, label %if
diff --git a/llvm/test/CodeGen/AMDGPU/spill-restore-partial-copy.mir b/llvm/test/CodeGen/AMDGPU/spill-restore-partial-copy.mir
new file mode 100644
index 0000000..bb87b6e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/spill-restore-partial-copy.mir
@@ -0,0 +1,324 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx950 -run-pass prologepilog,machine-cp -o - %s | FileCheck -check-prefix=GFX950 %s
+
+--- |
+ define amdgpu_kernel void @full_copy() #0 { ret void }
+
+ define amdgpu_kernel void @partial_copy() #0 { ret void }
+
+ define amdgpu_kernel void @full_spill() #0 { ret void }
+
+ attributes #0 = { "amdgpu-waves-per-eu"="8,8" }
+...
+
+---
+name: full_copy
+tracksRegLiveness: true
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+ - { id: 1, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+ - { id: 2, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+ - { id: 3, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+ - { id: 4, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+ - { id: 5, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+ - { id: 6, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+ hasSpilledVGPRs: true
+body: |
+ bb.0:
+ ; GFX950-LABEL: name: full_copy
+ ; GFX950: liveins: $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: renamable $agpr0_agpr1 = IMPLICIT_DEF
+ ; GFX950-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+ ; GFX950-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+ ; GFX950-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX950-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX950-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX950-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX950-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX950-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec, implicit $vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX950-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX950-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX950-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11, implicit $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX950-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX950-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX950-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX950-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit-def $vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX950-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec, implicit $vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX950-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec, implicit $vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX950-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec, implicit killed $vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX950-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr19, implicit $exec, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19, implicit $vgpr16_vgpr17_vgpr18_vgpr19
+ ; GFX950-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr18, implicit $exec, implicit $vgpr16_vgpr17_vgpr18_vgpr19
+ ; GFX950-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr17, implicit $exec, implicit $vgpr16_vgpr17_vgpr18_vgpr19
+ ; GFX950-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr16, implicit $exec, implicit killed $vgpr16_vgpr17_vgpr18_vgpr19
+ ; GFX950-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr23, implicit $exec, implicit-def $vgpr20_vgpr21_vgpr22_vgpr23, implicit $vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX950-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr22, implicit $exec, implicit $vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX950-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr21, implicit $exec, implicit $vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX950-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr20, implicit $exec, implicit killed $vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX950-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; GFX950-NEXT: $agpr5 = COPY $agpr6, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr4 = COPY $agpr7, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr3 = COPY $agpr8, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr2 = COPY $agpr9, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 0, 0, implicit $exec
+ ; GFX950-NEXT: $agpr5 = COPY $agpr10, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr4 = COPY $agpr11, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr3 = COPY $agpr12, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr2 = COPY $agpr13, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 1024, 0, implicit $exec
+ ; GFX950-NEXT: $agpr5 = COPY $agpr14, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr4 = COPY $agpr15, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr3 = COPY $agpr16, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr2 = COPY $agpr17, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 2048, 0, implicit $exec
+ ; GFX950-NEXT: $agpr5 = COPY $agpr18, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr4 = COPY $agpr19, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr3 = COPY $agpr20, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr2 = COPY $agpr21, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 3072, 0, implicit $exec
+ ; GFX950-NEXT: $agpr5 = COPY $agpr22, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr4 = COPY $agpr23, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr3 = COPY $agpr24, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr2 = COPY $agpr25, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 4096, 0, implicit $exec
+ ; GFX950-NEXT: $agpr5 = COPY $agpr26, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr4 = COPY $agpr27, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr3 = COPY $agpr28, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr2 = COPY $agpr29, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 5120, 0, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ renamable $agpr0_agpr1 = IMPLICIT_DEF
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+ renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+ SI_SPILL_AV128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)
+ SI_SPILL_AV128_SAVE killed $vgpr4_vgpr5_vgpr6_vgpr7, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
+ SI_SPILL_AV128_SAVE killed $vgpr8_vgpr9_vgpr10_vgpr11, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5)
+ SI_SPILL_AV128_SAVE killed $vgpr12_vgpr13_vgpr14_vgpr15, %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5)
+ SI_SPILL_AV128_SAVE killed $vgpr16_vgpr17_vgpr18_vgpr19, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
+ SI_SPILL_AV128_SAVE killed $vgpr20_vgpr21_vgpr22_vgpr23, %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5)
+ $vgpr0 = IMPLICIT_DEF
+ renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+ DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 0, 0, implicit $exec
+ renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
+ DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 1024, 0, implicit $exec
+ renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+ DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 2048, 0, implicit $exec
+ renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5)
+ DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 3072, 0, implicit $exec
+ renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
+ DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 4096, 0, implicit $exec
+ renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5)
+ DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 5120, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+
+# We need to add an implicit operand as well as an implicit-def operand to the scratch_load; otherwise, MachineCopyPropagation will think the preceding copies are dead and will delete them.
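+#
+# A simplified view of the hazard, taken from the partial_copy test below:
+#
+#   $agpr5 = COPY $agpr22, implicit-def $agpr2_agpr3_agpr4_agpr5
+#   $agpr4 = COPY $agpr23, implicit $agpr2_agpr3_agpr4_agpr5
+#   $agpr2_agpr3 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, ..., implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr2_agpr3_agpr4_agpr5
+#
+# Without the trailing implicit use, the implicit-def makes the load look like
+# it clobbers the whole tuple, so the two preceding copies into $agpr4 and
+# $agpr5 appear dead. The implicit use records that the unloaded lanes are
+# still live through the instruction.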
+
+---
+name: partial_copy
+tracksRegLiveness: true
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+ - { id: 1, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+ - { id: 2, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+ - { id: 3, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+ - { id: 4, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+ - { id: 5, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+ - { id: 6, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+ hasSpilledVGPRs: true
+body: |
+ bb.0:
+ ; GFX950-LABEL: name: partial_copy
+ ; GFX950: liveins: $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: renamable $agpr0_agpr1 = IMPLICIT_DEF
+ ; GFX950-NEXT: renamable $agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF
+ ; GFX950-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+ ; GFX950-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+ ; GFX950-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX950-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX950-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX950-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX950-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX950-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec, implicit $vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX950-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX950-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX950-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11, implicit $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX950-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX950-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX950-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX950-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit-def $vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX950-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec, implicit $vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX950-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec, implicit $vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX950-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec, implicit killed $vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX950-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr19, implicit $exec, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19, implicit $vgpr16_vgpr17_vgpr18_vgpr19
+ ; GFX950-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr18, implicit $exec, implicit $vgpr16_vgpr17_vgpr18_vgpr19
+ ; GFX950-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr17, implicit $exec, implicit $vgpr16_vgpr17_vgpr18_vgpr19
+ ; GFX950-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr16, implicit $exec, implicit killed $vgpr16_vgpr17_vgpr18_vgpr19
+ ; GFX950-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr23, implicit $exec, implicit-def $vgpr20_vgpr21_vgpr22_vgpr23, implicit $vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX950-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr22, implicit $exec, implicit $vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX950-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr20_vgpr21, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr20_vgpr21_vgpr22_vgpr23 :: (store (s64) into %stack.5, align 4, addrspace 5)
+ ; GFX950-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; GFX950-NEXT: $agpr5 = COPY $agpr6, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr4 = COPY $agpr7, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr3 = COPY $agpr8, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr2 = COPY $agpr9, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 0, 0, implicit $exec
+ ; GFX950-NEXT: $agpr5 = COPY $agpr10, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr4 = COPY $agpr11, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr3 = COPY $agpr12, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr2 = COPY $agpr13, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 1024, 0, implicit $exec
+ ; GFX950-NEXT: $agpr5 = COPY $agpr14, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr4 = COPY $agpr15, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr3 = COPY $agpr16, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr2 = COPY $agpr17, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 2048, 0, implicit $exec
+ ; GFX950-NEXT: $agpr5 = COPY $agpr18, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr4 = COPY $agpr19, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr3 = COPY $agpr20, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr2 = COPY $agpr21, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 3072, 0, implicit $exec
+ ; GFX950-NEXT: $agpr5 = COPY $agpr22, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr4 = COPY $agpr23, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr3 = COPY $agpr24, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr2 = COPY $agpr25, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 4096, 0, implicit $exec
+ ; GFX950-NEXT: $agpr5 = COPY $agpr26, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr4 = COPY $agpr27, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr2_agpr3 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr2_agpr3_agpr4_agpr5 :: (load (s64) from %stack.5, align 4, addrspace 5)
+ ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 5120, 0, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ renamable $agpr0_agpr1 = IMPLICIT_DEF
+ renamable $agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+ renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+ SI_SPILL_AV128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)
+ SI_SPILL_AV128_SAVE killed $vgpr4_vgpr5_vgpr6_vgpr7, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
+ SI_SPILL_AV128_SAVE killed $vgpr8_vgpr9_vgpr10_vgpr11, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5)
+ SI_SPILL_AV128_SAVE killed $vgpr12_vgpr13_vgpr14_vgpr15, %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5)
+ SI_SPILL_AV128_SAVE killed $vgpr16_vgpr17_vgpr18_vgpr19, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
+ SI_SPILL_AV128_SAVE killed $vgpr20_vgpr21_vgpr22_vgpr23, %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5)
+ $vgpr0 = IMPLICIT_DEF
+ renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+ DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 0, 0, implicit $exec
+ renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
+ DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 1024, 0, implicit $exec
+ renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+ DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 2048, 0, implicit $exec
+ renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5)
+ DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 3072, 0, implicit $exec
+ renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
+ DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 4096, 0, implicit $exec
+ renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5)
+ DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 5120, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+# Since there are no preceding copies, we do not need to add an implicit operand: the implicit-def operand does not clobber any lanes that are still live.
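+#
+# Here the restore below is a full SCRATCH_LOAD_DWORDX4_SADDR that explicitly
+# defines all four lanes of $agpr2_agpr3_agpr4_agpr5, so the implicit-def does
+# not hide any live lanes and no implicit use is required.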
+
+---
+name: full_spill
+tracksRegLiveness: true
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+ - { id: 1, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+ - { id: 2, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+ - { id: 3, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+ - { id: 4, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+ - { id: 5, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+ - { id: 6, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 }
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+ hasSpilledVGPRs: true
+body: |
+ bb.0:
+ ; GFX950-LABEL: name: full_spill
+ ; GFX950: liveins: $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: renamable $agpr0_agpr1 = IMPLICIT_DEF
+ ; GFX950-NEXT: renamable $agpr26_agpr27 = IMPLICIT_DEF
+ ; GFX950-NEXT: renamable $agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF
+ ; GFX950-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+ ; GFX950-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+ ; GFX950-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX950-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX950-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX950-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX950-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX950-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec, implicit $vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX950-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX950-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX950-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11, implicit $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX950-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX950-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX950-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX950-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit-def $vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX950-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec, implicit $vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX950-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec, implicit $vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX950-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec, implicit killed $vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX950-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr19, implicit $exec, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19, implicit $vgpr16_vgpr17_vgpr18_vgpr19
+ ; GFX950-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr18, implicit $exec, implicit $vgpr16_vgpr17_vgpr18_vgpr19
+ ; GFX950-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr17, implicit $exec, implicit $vgpr16_vgpr17_vgpr18_vgpr19
+ ; GFX950-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr16, implicit $exec, implicit killed $vgpr16_vgpr17_vgpr18_vgpr19
+ ; GFX950-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr20_vgpr21_vgpr22_vgpr23, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into %stack.5, align 4, addrspace 5)
+ ; GFX950-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; GFX950-NEXT: $agpr5 = COPY $agpr6, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr4 = COPY $agpr7, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr3 = COPY $agpr8, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr2 = COPY $agpr9, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 0, 0, implicit $exec
+ ; GFX950-NEXT: $agpr5 = COPY $agpr10, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr4 = COPY $agpr11, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr3 = COPY $agpr12, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr2 = COPY $agpr13, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 1024, 0, implicit $exec
+ ; GFX950-NEXT: $agpr5 = COPY $agpr14, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr4 = COPY $agpr15, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr3 = COPY $agpr16, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr2 = COPY $agpr17, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 2048, 0, implicit $exec
+ ; GFX950-NEXT: $agpr5 = COPY $agpr18, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr4 = COPY $agpr19, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr3 = COPY $agpr20, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr2 = COPY $agpr21, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 3072, 0, implicit $exec
+ ; GFX950-NEXT: $agpr5 = COPY $agpr22, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr4 = COPY $agpr23, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr3 = COPY $agpr24, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: $agpr2 = COPY $agpr25, implicit $agpr2_agpr3_agpr4_agpr5
+ ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 4096, 0, implicit $exec
+ ; GFX950-NEXT: $agpr2_agpr3_agpr4_agpr5 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.5, align 4, addrspace 5)
+ ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 5120, 0, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ renamable $agpr0_agpr1 = IMPLICIT_DEF
+ renamable $agpr26_agpr27 = IMPLICIT_DEF
+ renamable $agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+ renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+ SI_SPILL_AV128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)
+ SI_SPILL_AV128_SAVE killed $vgpr4_vgpr5_vgpr6_vgpr7, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
+ SI_SPILL_AV128_SAVE killed $vgpr8_vgpr9_vgpr10_vgpr11, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5)
+ SI_SPILL_AV128_SAVE killed $vgpr12_vgpr13_vgpr14_vgpr15, %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5)
+ SI_SPILL_AV128_SAVE killed $vgpr16_vgpr17_vgpr18_vgpr19, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
+ SI_SPILL_AV128_SAVE killed $vgpr20_vgpr21_vgpr22_vgpr23, %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5)
+ $vgpr0 = IMPLICIT_DEF
+ renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+ DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 0, 0, implicit $exec
+ renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
+ DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 1024, 0, implicit $exec
+ renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+ DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 2048, 0, implicit $exec
+ renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5)
+ DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 3072, 0, implicit $exec
+ renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
+ DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 4096, 0, implicit $exec
+ renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5)
+ DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 5120, 0, implicit $exec
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/spill-to-agpr-partial.mir b/llvm/test/CodeGen/AMDGPU/spill-to-agpr-partial.mir
index 52593e0..beeb9b2 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-to-agpr-partial.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-to-agpr-partial.mir
@@ -19,7 +19,7 @@ body: |
; GCN-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GCN-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s96) into %stack.0, align 4, addrspace 5)
; GCN-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr31, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
- ; GCN-NEXT: $vgpr0_vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s96) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: $vgpr0_vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s96) from %stack.0, align 4, addrspace 5)
; GCN-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25_agpr26_agpr27, implicit $agpr28_agpr29, implicit $agpr30
SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
@@ -46,7 +46,7 @@ body: |
; GCN-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s64) into %stack.0, align 4, addrspace 5)
; GCN-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr30, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
; GCN-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr31, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
- ; GCN-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s64) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s64) from %stack.0, align 4, addrspace 5)
; GCN-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25_agpr26_agpr27, implicit $agpr28_agpr29
SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
@@ -75,7 +75,7 @@ body: |
; GCN-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr29, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
; GCN-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr30, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GCN-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr31, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
- ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s32) from %stack.0, addrspace 5)
+ ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25_agpr26_agpr27, implicit $agpr28
SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
@@ -129,7 +129,7 @@ body: |
; GCN-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3
; GCN-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $agpr0_agpr1_agpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store (s96) into %stack.0, align 4, addrspace 5)
; GCN-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr55, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
- ; GCN-NEXT: $agpr0_agpr1_agpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3 :: (load (s96) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: $agpr0_agpr1_agpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 :: (load (s96) from %stack.0, align 4, addrspace 5)
; GCN-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51, implicit $vgpr52_vgpr53, implicit $vgpr54
SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
@@ -156,7 +156,7 @@ body: |
; GCN-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $agpr0_agpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store (s64) into %stack.0, align 4, addrspace 5)
; GCN-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr54, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
; GCN-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr55, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
- ; GCN-NEXT: $agpr0_agpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3 :: (load (s64) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: $agpr0_agpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 :: (load (s64) from %stack.0, align 4, addrspace 5)
; GCN-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51, implicit $vgpr52_vgpr53
SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
@@ -185,7 +185,7 @@ body: |
; GCN-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr53, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
; GCN-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr54, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GCN-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr55, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
- ; GCN-NEXT: $agpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3 :: (load (s32) from %stack.0, addrspace 5)
+ ; GCN-NEXT: $agpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51, implicit $vgpr52
SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
diff --git a/llvm/test/CodeGen/AMDGPU/vector-spill-restore-to-other-vector-type.mir b/llvm/test/CodeGen/AMDGPU/vector-spill-restore-to-other-vector-type.mir
index 2fac3d2..69cf924 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-spill-restore-to-other-vector-type.mir
+++ b/llvm/test/CodeGen/AMDGPU/vector-spill-restore-to-other-vector-type.mir
@@ -22,7 +22,7 @@ body: |
; GCN-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3
; GCN-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $agpr0_agpr1_agpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store (s96) into %stack.0, align 4, addrspace 5)
; GCN-NEXT: $vgpr51 = COPY $vgpr55, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51
- ; GCN-NEXT: $vgpr48_vgpr49_vgpr50 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s96) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: $vgpr48_vgpr49_vgpr50 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51, implicit $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s96) from %stack.0, align 4, addrspace 5)
; GCN-NEXT: S_ENDPGM 0, implicit $vgpr52, implicit $vgpr53, implicit $vgpr54, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$vgpr48_vgpr49_vgpr50_vgpr51 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
@@ -49,7 +49,7 @@ body: |
; GCN-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $agpr0_agpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store (s64) into %stack.0, align 4, addrspace 5)
; GCN-NEXT: $vgpr51 = COPY $vgpr54, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51
; GCN-NEXT: $vgpr50 = COPY $vgpr55, implicit $vgpr48_vgpr49_vgpr50_vgpr51
- ; GCN-NEXT: $vgpr48_vgpr49 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s64) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: $vgpr48_vgpr49 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51, implicit $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s64) from %stack.0, align 4, addrspace 5)
; GCN-NEXT: S_ENDPGM 0, implicit $vgpr52, implicit $vgpr53, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$vgpr48_vgpr49_vgpr50_vgpr51 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
@@ -78,7 +78,7 @@ body: |
; GCN-NEXT: $vgpr51 = COPY $vgpr53, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51
; GCN-NEXT: $vgpr50 = COPY $vgpr54, implicit $vgpr48_vgpr49_vgpr50_vgpr51
; GCN-NEXT: $vgpr49 = COPY $vgpr55, implicit $vgpr48_vgpr49_vgpr50_vgpr51
- ; GCN-NEXT: $vgpr48 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s32) from %stack.0, addrspace 5)
+ ; GCN-NEXT: $vgpr48 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51, implicit $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: S_ENDPGM 0, implicit $vgpr52, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$vgpr48_vgpr49_vgpr50_vgpr51 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
@@ -132,7 +132,7 @@ body: |
; GCN-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GCN-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s96) into %stack.0, align 4, addrspace 5)
; GCN-NEXT: $agpr29 = COPY $agpr30, implicit-def $agpr26_agpr27_agpr28_agpr29
- ; GCN-NEXT: $agpr26_agpr27_agpr28 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29 :: (load (s96) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: $agpr26_agpr27_agpr28 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29, implicit $agpr26_agpr27_agpr28_agpr29 :: (load (s96) from %stack.0, align 4, addrspace 5)
; GCN-NEXT: S_ENDPGM 0, implicit $agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25
SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$agpr26_agpr27_agpr28_agpr29 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
@@ -159,7 +159,7 @@ body: |
; GCN-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s64) into %stack.0, align 4, addrspace 5)
; GCN-NEXT: $agpr29 = COPY $agpr30, implicit-def $agpr26_agpr27_agpr28_agpr29
; GCN-NEXT: $agpr28 = COPY $agpr31, implicit $agpr26_agpr27_agpr28_agpr29
- ; GCN-NEXT: $agpr26_agpr27 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29 :: (load (s64) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: $agpr26_agpr27 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29, implicit $agpr26_agpr27_agpr28_agpr29 :: (load (s64) from %stack.0, align 4, addrspace 5)
; GCN-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25
SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$agpr26_agpr27_agpr28_agpr29 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
@@ -188,7 +188,7 @@ body: |
; GCN-NEXT: $agpr29 = COPY $agpr25, implicit-def $agpr26_agpr27_agpr28_agpr29
; GCN-NEXT: $agpr28 = COPY $agpr30, implicit $agpr26_agpr27_agpr28_agpr29
; GCN-NEXT: $agpr27 = COPY $agpr31, implicit $agpr26_agpr27_agpr28_agpr29
- ; GCN-NEXT: $agpr26 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29 :: (load (s32) from %stack.0, addrspace 5)
+ ; GCN-NEXT: $agpr26 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29, implicit $agpr26_agpr27_agpr28_agpr29 :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24
SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$agpr26_agpr27_agpr28_agpr29 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
diff --git a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
index 2d7a91f..985bcbd 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
+++ b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
@@ -1,22 +1,50 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-GISEL %s
define amdgpu_ps void @intrinsic_store_system_scope(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
-; GFX12-LABEL: intrinsic_store_system_scope:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: buffer_store_b32 v0, v[1:2], s[0:3], s4 idxen offen scope:SCOPE_SYS
-; GFX12-NEXT: s_endpgm
+; GFX1200-LABEL: intrinsic_store_system_scope:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: buffer_store_b32 v0, v[1:2], s[0:3], s4 idxen offen scope:SCOPE_SYS
+; GFX1200-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: intrinsic_store_system_scope:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: buffer_store_b32 v0, v[2:3], s[0:3], s4 idxen offen scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: intrinsic_store_system_scope:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: buffer_store_b32 v0, v[4:5], s[0:3], s4 idxen offen scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT: s_endpgm
call void @llvm.amdgcn.struct.buffer.store.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 24)
ret void
}
define amdgpu_ps void @generic_store_volatile(i32 %val, ptr addrspace(1) %out) {
-; GFX12-LABEL: generic_store_volatile:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: global_store_b32 v[1:2], v0, off scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_endpgm
+; GFX1200-LABEL: generic_store_volatile:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: global_store_b32 v[1:2], v0, off scope:SCOPE_SYS
+; GFX1200-NEXT: s_wait_storecnt 0x0
+; GFX1200-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: generic_store_volatile:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: global_store_b32 v[2:3], v0, off scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: generic_store_volatile:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: global_store_b32 v[4:5], v0, off scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX1250-GISEL-NEXT: s_endpgm
store volatile i32 %val, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/PowerPC/p10-spill-crun.ll b/llvm/test/CodeGen/PowerPC/p10-spill-crun.ll
index 4ca2dc5db..eba9faa 100644
--- a/llvm/test/CodeGen/PowerPC/p10-spill-crun.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-spill-crun.ll
@@ -234,8 +234,8 @@ define dso_local void @P10_Spill_CR_UN(ptr %arg, ptr %arg1, i32 %arg2) local_unn
; CHECK-BE-NEXT: # %bb.4: # %bb37
; CHECK-BE-NEXT: bc 4, 4*cr5+lt, .LBB0_14
; CHECK-BE-NEXT: .LBB0_5: # %bb42
-; CHECK-BE-NEXT: addi r3, r3, global_1@toc@l
; CHECK-BE-NEXT: li r4, 0
+; CHECK-BE-NEXT: addi r3, r3, global_1@toc@l
; CHECK-BE-NEXT: cmpwi r28, 0
; CHECK-BE-NEXT: isel r3, r3, r4, 4*cr2+gt
; CHECK-BE-NEXT: crnot 4*cr2+lt, eq
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll
index d506d20..e5d305f 100644
--- a/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll
@@ -1085,14 +1085,14 @@ define dso_local signext i32 @v16i8tov16i32_sign(<16 x i8> %a) local_unnamed_add
; PWR10BE-NEXT: addis r3, r2, .LCPI17_2@toc@ha
; PWR10BE-NEXT: vperm v3, v2, v2, v3
; PWR10BE-NEXT: addi r3, r3, .LCPI17_2@toc@l
-; PWR10BE-NEXT: vextsb2w v3, v3
; PWR10BE-NEXT: lxv v5, 0(r3)
; PWR10BE-NEXT: addis r3, r2, .LCPI17_3@toc@ha
+; PWR10BE-NEXT: vextsb2w v3, v3
; PWR10BE-NEXT: vperm v4, v2, v2, v4
; PWR10BE-NEXT: addi r3, r3, .LCPI17_3@toc@l
-; PWR10BE-NEXT: vextsb2w v4, v4
; PWR10BE-NEXT: lxv v0, 0(r3)
; PWR10BE-NEXT: li r3, 0
+; PWR10BE-NEXT: vextsb2w v4, v4
; PWR10BE-NEXT: vperm v5, v2, v2, v5
; PWR10BE-NEXT: vadduwm v3, v4, v3
; PWR10BE-NEXT: vextsb2w v5, v5
@@ -1212,9 +1212,9 @@ define dso_local zeroext i32 @v16i8tov16i32_zero(<16 x i8> %a) local_unnamed_add
; PWR10BE-NEXT: addis r3, r2, .LCPI18_3@toc@ha
; PWR10BE-NEXT: vperm v5, v4, v2, v5
; PWR10BE-NEXT: addi r3, r3, .LCPI18_3@toc@l
-; PWR10BE-NEXT: vadduwm v3, v5, v3
; PWR10BE-NEXT: lxv v1, 0(r3)
; PWR10BE-NEXT: li r3, 0
+; PWR10BE-NEXT: vadduwm v3, v5, v3
; PWR10BE-NEXT: vperm v0, v4, v2, v0
; PWR10BE-NEXT: vperm v2, v4, v2, v1
; PWR10BE-NEXT: vadduwm v2, v2, v0
@@ -1568,41 +1568,41 @@ define dso_local i64 @v16i8tov16i64_sign(<16 x i8> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: addis r3, r2, .LCPI23_0@toc@ha
; PWR10BE-NEXT: xxspltib v1, 255
; PWR10BE-NEXT: addi r3, r3, .LCPI23_0@toc@l
-; PWR10BE-NEXT: vsrq v1, v1, v1
; PWR10BE-NEXT: lxv v3, 0(r3)
; PWR10BE-NEXT: addis r3, r2, .LCPI23_1@toc@ha
+; PWR10BE-NEXT: vsrq v1, v1, v1
; PWR10BE-NEXT: addi r3, r3, .LCPI23_1@toc@l
; PWR10BE-NEXT: vperm v1, v2, v2, v1
; PWR10BE-NEXT: lxv v4, 0(r3)
; PWR10BE-NEXT: addis r3, r2, .LCPI23_2@toc@ha
-; PWR10BE-NEXT: vextsb2d v1, v1
; PWR10BE-NEXT: vperm v3, v2, v2, v3
+; PWR10BE-NEXT: vextsb2d v1, v1
; PWR10BE-NEXT: addi r3, r3, .LCPI23_2@toc@l
-; PWR10BE-NEXT: vextsb2d v3, v3
; PWR10BE-NEXT: lxv v5, 0(r3)
; PWR10BE-NEXT: addis r3, r2, .LCPI23_3@toc@ha
+; PWR10BE-NEXT: vextsb2d v3, v3
; PWR10BE-NEXT: vperm v4, v2, v2, v4
; PWR10BE-NEXT: addi r3, r3, .LCPI23_3@toc@l
-; PWR10BE-NEXT: vextsb2d v4, v4
; PWR10BE-NEXT: lxv v0, 0(r3)
; PWR10BE-NEXT: addis r3, r2, .LCPI23_4@toc@ha
+; PWR10BE-NEXT: vextsb2d v4, v4
; PWR10BE-NEXT: vperm v5, v2, v2, v5
; PWR10BE-NEXT: addi r3, r3, .LCPI23_4@toc@l
-; PWR10BE-NEXT: vextsb2d v5, v5
; PWR10BE-NEXT: lxv v6, 0(r3)
; PWR10BE-NEXT: addis r3, r2, .LCPI23_5@toc@ha
+; PWR10BE-NEXT: vextsb2d v5, v5
; PWR10BE-NEXT: vperm v0, v2, v2, v0
; PWR10BE-NEXT: addi r3, r3, .LCPI23_5@toc@l
-; PWR10BE-NEXT: vextsb2d v0, v0
; PWR10BE-NEXT: lxv v7, 0(r3)
; PWR10BE-NEXT: addis r3, r2, .LCPI23_6@toc@ha
+; PWR10BE-NEXT: vextsb2d v0, v0
; PWR10BE-NEXT: vperm v6, v2, v2, v6
+; PWR10BE-NEXT: addi r3, r3, .LCPI23_6@toc@l
; PWR10BE-NEXT: vaddudm v5, v0, v5
; PWR10BE-NEXT: vaddudm v3, v4, v3
; PWR10BE-NEXT: vaddudm v3, v3, v5
-; PWR10BE-NEXT: addi r3, r3, .LCPI23_6@toc@l
-; PWR10BE-NEXT: vextsb2d v6, v6
; PWR10BE-NEXT: lxv v8, 0(r3)
+; PWR10BE-NEXT: vextsb2d v6, v6
; PWR10BE-NEXT: vperm v7, v2, v2, v7
; PWR10BE-NEXT: vextsb2d v7, v7
; PWR10BE-NEXT: vperm v2, v2, v2, v8
diff --git a/llvm/test/CodeGen/SPIRV/branching/OpSwitch64.ll b/llvm/test/CodeGen/SPIRV/branching/OpSwitch64.ll
index 5e4f1f1..8fc986f 100644
--- a/llvm/test/CodeGen/SPIRV/branching/OpSwitch64.ll
+++ b/llvm/test/CodeGen/SPIRV/branching/OpSwitch64.ll
@@ -19,7 +19,11 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
-; CHECK-SPIRV: OpSwitch %[[#]] %[[#]] 0 0 %[[#]] 1 0 %[[#]] 1 5 %[[#]]
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=asm | spirv-as - -o /dev/null %}
+
+; CHECK-SPIRV: OpSwitch %[[#]] %[[#]] 0 %[[#]] 1 %[[#]] 21474836481 %[[#]]
define spir_kernel void @test_64(i32 addrspace(1)* %res) {
entry:
diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll b/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll
index 6b8746e..a86420e 100644
--- a/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll
+++ b/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll
@@ -736,3 +736,40 @@ exit:
ret void
}
+; Test an INLINEASM that defines CC.
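+; The "={@cc}" output constraint yields the condition code as an i32 in
+; [0, 4); per the CHECK lines below, it is materialized with IPM followed by
+; a logical shift right of 28 to move CC into the low two bits.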
+@wait_fence = global i32 0, align 4
+@bit_cc = global i32 0, align 4
+define void @test_inlineasm_define_cc() {
+; CHECK-LABEL: test_inlineasm_define_cc:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lgrl %r1, wait_fence@GOT
+; CHECK-NEXT: chsi 0(%r1), 0
+; CHECK-NEXT: ber %r14
+; CHECK-NEXT: .LBB29_1: # %while.body.lr.ph
+; CHECK-NEXT: lgrl %r1, bit_cc@GOT
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: ipm %r0
+; CHECK-NEXT: srl %r0, 28
+; CHECK-NEXT: st %r0, 0(%r1)
+; CHECK-NEXT: .LBB29_2: # %while.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: j .LBB29_2
+entry:
+ %0 = load i32, ptr @wait_fence, align 4
+ %tobool.not = icmp eq i32 %0, 0
+ br i1 %tobool.not, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:
+ %1 = tail call i32 asm "", "={@cc}"()
+ %2 = icmp ult i32 %1, 4
+ tail call void @llvm.assume(i1 %2)
+ store i32 %1, ptr @bit_cc, align 4
+ br label %while.body
+
+while.body:
+ br label %while.body
+
+while.end:
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll b/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll
index faa119c..5f0682a 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s
define void @undef_2phi(ptr%buf) {
; CHECK-LABEL: @undef_2phi(
diff --git a/llvm/test/CodeGen/X86/AMX/amx-combine.ll b/llvm/test/CodeGen/X86/AMX/amx-combine.ll
index 07f489c..72e072dd 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-combine.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-combine.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s
define void @combine_store(ptr%p) {
; CHECK-LABEL: @combine_store(
diff --git a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll
index 6c536f1..4ac406c 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -lower-amx-type -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -x86-lower-amx-type -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -passes=x86-lower-amx-type -S | FileCheck %s
@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
diff --git a/llvm/test/CodeGen/X86/AMX/amx-type.ll b/llvm/test/CodeGen/X86/AMX/amx-type.ll
index 1d9af2b..294195a 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-type.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-type.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s
%struct.__tile_str = type { i16, i16, <256 x i32> }
diff --git a/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll
index b70668f..cdce783 100644
--- a/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll
+++ b/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s
define void @combine_amx_cast_inside_bb() {
; CHECK-LABEL: @combine_amx_cast_inside_bb(
diff --git a/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll
index 3a5b4245..0b419bb 100644
--- a/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll
+++ b/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s
%struct.__tile_str = type { i16, i16, <256 x i32> }
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll
index 52641c6..3549875 100644
--- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll
+++ b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
- ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
+ ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s
+ ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s
@buf = dso_local global [2048 x i8] zeroinitializer, align 16
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll
index 346d46b..96966264 100644
--- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll
+++ b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s
@buf = dso_local global [2048 x i8] zeroinitializer, align 16
@buf2 = dso_local global [2048 x i8] zeroinitializer, align 16
diff --git a/llvm/test/CodeGen/X86/trunc-srl-load.ll b/llvm/test/CodeGen/X86/trunc-srl-load.ll
new file mode 100644
index 0000000..4dae143
--- /dev/null
+++ b/llvm/test/CodeGen/X86/trunc-srl-load.ll
@@ -0,0 +1,1672 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512
+
+; Tests showing how analysis of non-constant shift amounts can improve load address math.
+
+; Alignment of shift amounts should allow sub-integer loads.
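+; For instance, in @extractSub64_16 below the shift amount is masked to a
+; 16-bit boundary, so the pattern (a sketch of the fold such alignment would
+; permit on little-endian targets, not output checked by this file):
+;   %res = trunc (lshr (load i64, ptr %word), %sh) to i16
+; could become a direct half-word load at a computed byte offset:
+;   %byteoff = lshr i32 %idx_align, 3
+;   %ptr = getelementptr i8, ptr %word, i32 %byteoff
+;   %res = load i16, ptr %ptr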
+
+define i16 @extractSub64_16(ptr %word, i32 %idx) nounwind {
+; X86-LABEL: extractSub64_16:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl (%eax), %edx
+; X86-NEXT: movl 4(%eax), %esi
+; X86-NEXT: movb %ch, %cl
+; X86-NEXT: andb $16, %cl
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: shrl %cl, %eax
+; X86-NEXT: shrdl %cl, %esi, %edx
+; X86-NEXT: testb $32, %ch
+; X86-NEXT: jne .LBB0_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: .LBB0_2:
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; SSE-LABEL: extractSub64_16:
+; SSE: # %bb.0:
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: andb $48, %cl
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shrq %cl, %rax
+; SSE-NEXT: # kill: def $ax killed $ax killed $rax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractSub64_16:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $esi killed $esi def $rsi
+; AVX-NEXT: andb $48, %sil
+; AVX-NEXT: shrxq %rsi, (%rdi), %rax
+; AVX-NEXT: # kill: def $ax killed $ax killed $rax
+; AVX-NEXT: retq
+ %idx_bounds = and i32 %idx, 63
+ %idx_align = and i32 %idx_bounds, -16
+ %sh = zext nneg i32 %idx_align to i64
+ %ld = load i64, ptr %word, align 8
+ %sub = lshr i64 %ld, %sh
+ %res = trunc i64 %sub to i16
+ ret i16 %res
+}
+
+define i16 @extractSub128_16(ptr %word, i32 %idx) nounwind {
+; X86-LABEL: extractSub128_16:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $32, %esp
+; X86-NEXT: movzbl 12(%ebp), %eax
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: movl 4(%ecx), %esi
+; X86-NEXT: movl 8(%ecx), %edi
+; X86-NEXT: movl 12(%ecx), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, (%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andb $16, %cl
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: movl (%esp,%edx), %eax
+; X86-NEXT: movl 4(%esp,%edx), %edx
+; X86-NEXT: shrdl %cl, %edx, %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: leal -8(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
+; SSE-LABEL: extractSub128_16:
+; SSE: # %bb.0:
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rdx
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andb $48, %cl
+; SSE-NEXT: movq %rdx, %rdi
+; SSE-NEXT: shrq %cl, %rdi
+; SSE-NEXT: shrdq %cl, %rdx, %rax
+; SSE-NEXT: testb $64, %sil
+; SSE-NEXT: cmovneq %rdi, %rax
+; SSE-NEXT: # kill: def $ax killed $ax killed $rax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractSub128_16:
+; AVX: # %bb.0:
+; AVX-NEXT: movq (%rdi), %rdx
+; AVX-NEXT: movq 8(%rdi), %rax
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: andb $48, %cl
+; AVX-NEXT: shrdq %cl, %rax, %rdx
+; AVX-NEXT: shrxq %rcx, %rax, %rax
+; AVX-NEXT: testb $64, %sil
+; AVX-NEXT: cmoveq %rdx, %rax
+; AVX-NEXT: # kill: def $ax killed $ax killed $rax
+; AVX-NEXT: retq
+ %idx_bounds = and i32 %idx, 127
+ %idx_align = and i32 %idx_bounds, -16
+ %sh = zext nneg i32 %idx_align to i128
+ %ld = load i128, ptr %word, align 8
+ %sub = lshr i128 %ld, %sh
+ %res = trunc i128 %sub to i16
+ ret i16 %res
+}
+
+define i32 @extractSub128_32(ptr %word, i32 %idx) nounwind {
+; X86-LABEL: extractSub128_32:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $32, %esp
+; X86-NEXT: movzbl 12(%ebp), %eax
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: movl 4(%ecx), %esi
+; X86-NEXT: movl 8(%ecx), %edi
+; X86-NEXT: movl 12(%ecx), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, (%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: andb $96, %al
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl (%esp,%eax), %eax
+; X86-NEXT: leal -8(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
+; SSE-LABEL: extractSub128_32:
+; SSE: # %bb.0:
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rdx
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andb $32, %cl
+; SSE-NEXT: movq %rdx, %rdi
+; SSE-NEXT: shrq %cl, %rdi
+; SSE-NEXT: shrdq %cl, %rdx, %rax
+; SSE-NEXT: testb $64, %sil
+; SSE-NEXT: cmovneq %rdi, %rax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractSub128_32:
+; AVX: # %bb.0:
+; AVX-NEXT: movq (%rdi), %rdx
+; AVX-NEXT: movq 8(%rdi), %rax
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: andb $32, %cl
+; AVX-NEXT: shrdq %cl, %rax, %rdx
+; AVX-NEXT: shrxq %rcx, %rax, %rax
+; AVX-NEXT: testb $64, %sil
+; AVX-NEXT: cmoveq %rdx, %rax
+; AVX-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX-NEXT: retq
+ %idx_bounds = and i32 %idx, 127
+ %idx_align = and i32 %idx_bounds, -32
+ %sh = zext nneg i32 %idx_align to i128
+ %ld = load i128, ptr %word, align 8
+ %sub = lshr i128 %ld, %sh
+ %res = trunc i128 %sub to i32
+ ret i32 %res
+}
+
+define i64 @extractSub128_64(ptr %word, i32 %idx) nounwind {
+; X86-LABEL: extractSub128_64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $32, %esp
+; X86-NEXT: movzbl 12(%ebp), %eax
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: movl 4(%ecx), %esi
+; X86-NEXT: movl 8(%ecx), %edi
+; X86-NEXT: movl 12(%ecx), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, (%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: andb $64, %al
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: movzbl %al, %ecx
+; X86-NEXT: movl (%esp,%ecx), %eax
+; X86-NEXT: movl 4(%esp,%ecx), %edx
+; X86-NEXT: leal -8(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
+; X64-LABEL: extractSub128_64:
+; X64: # %bb.0:
+; X64-NEXT: testb $64, %sil
+; X64-NEXT: je .LBB3_1
+; X64-NEXT: # %bb.2:
+; X64-NEXT: movq 8(%rdi), %rax
+; X64-NEXT: retq
+; X64-NEXT: .LBB3_1:
+; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: retq
+ %idx_bounds = and i32 %idx, 127
+ %idx_align = and i32 %idx_bounds, -64
+ %sh = zext nneg i32 %idx_align to i128
+ %ld = load i128, ptr %word, align 8
+ %sub = lshr i128 %ld, %sh
+ %res = trunc i128 %sub to i64
+ ret i64 %res
+}
+
+define i8 @extractSub512_8(ptr %word, i32 %idx) nounwind {
+; X86-LABEL: extractSub512_8:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $192, %esp
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl (%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%eax), %ebx
+; X86-NEXT: movl 44(%eax), %edi
+; X86-NEXT: movl 48(%eax), %esi
+; X86-NEXT: movl 52(%eax), %edx
+; X86-NEXT: movl 56(%eax), %ecx
+; X86-NEXT: movl 60(%eax), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 12(%ebp), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: andl $24, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: andl $60, %edx
+; X86-NEXT: movl 48(%esp,%edx), %eax
+; X86-NEXT: movl 52(%esp,%edx), %edx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %edx, %eax
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
+; SSE-LABEL: extractSub512_8:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rax
+; SSE-NEXT: # kill: def $esi killed $esi def $rsi
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: movups 32(%rdi), %xmm2
+; SSE-NEXT: movups 48(%rdi), %xmm3
+; SSE-NEXT: xorps %xmm4, %xmm4
+; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $56, %ecx
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: movq -128(%rsp,%rsi), %rdx
+; SSE-NEXT: shrq %cl, %rdx
+; SSE-NEXT: movl -120(%rsp,%rsi), %eax
+; SSE-NEXT: addl %eax, %eax
+; SSE-NEXT: notl %ecx
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shlq %cl, %rax
+; SSE-NEXT: orl %edx, %eax
+; SSE-NEXT: # kill: def $al killed $al killed $rax
+; SSE-NEXT: popq %rcx
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: extractSub512_8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: # kill: def $esi killed $esi def $rsi
+; AVX2-NEXT: vmovups (%rdi), %ymm0
+; AVX2-NEXT: vmovups 32(%rdi), %ymm1
+; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $56, %ecx
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: notl %ecx
+; AVX2-NEXT: movl -120(%rsp,%rsi), %edx
+; AVX2-NEXT: addl %edx, %edx
+; AVX2-NEXT: shlxq %rcx, %rdx, %rcx
+; AVX2-NEXT: orl %ecx, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $rax
+; AVX2-NEXT: popq %rcx
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: extractSub512_8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rax
+; AVX512-NEXT: vmovups (%rdi), %ymm0
+; AVX512-NEXT: vmovups 32(%rdi), %ymm1
+; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: # kill: def $esi killed $esi def $rsi
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: andl $56, %ecx
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: andl $56, %esi
+; AVX512-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rax
+; AVX512-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX512-NEXT: notl %ecx
+; AVX512-NEXT: movl -120(%rsp,%rsi), %edx
+; AVX512-NEXT: addl %edx, %edx
+; AVX512-NEXT: shlxq %rcx, %rdx, %rcx
+; AVX512-NEXT: orl %ecx, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $rax
+; AVX512-NEXT: popq %rcx
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %idx_bounds = and i32 %idx, 511
+ %idx_align = and i32 %idx_bounds, -8
+ %ld = load i512, ptr %word, align 8
+ %sh = zext nneg i32 %idx_align to i512
+ %sub = lshr i512 %ld, %sh
+ %res = trunc i512 %sub to i8
+ ret i8 %res
+}
+
+define i64 @extractSub512_64(ptr %word, i32 %idx) nounwind {
+; X86-LABEL: extractSub512_64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $192, %esp
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl (%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%eax), %ebx
+; X86-NEXT: movl 44(%eax), %edi
+; X86-NEXT: movl 48(%eax), %esi
+; X86-NEXT: movl 52(%eax), %edx
+; X86-NEXT: movl 56(%eax), %ecx
+; X86-NEXT: movl 60(%eax), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: andl $56, %ecx
+; X86-NEXT: movl 48(%esp,%ecx), %eax
+; X86-NEXT: movl 52(%esp,%ecx), %edx
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
+; SSE-LABEL: extractSub512_64:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rax
+; SSE-NEXT: # kill: def $esi killed $esi def $rsi
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: movups 32(%rdi), %xmm2
+; SSE-NEXT: movups 48(%rdi), %xmm3
+; SSE-NEXT: xorps %xmm4, %xmm4
+; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: movq -128(%rsp,%rsi), %rax
+; SSE-NEXT: popq %rcx
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: extractSub512_64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: vmovups (%rdi), %ymm0
+; AVX2-NEXT: vmovups 32(%rdi), %ymm1
+; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: # kill: def $esi killed $esi def $rsi
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: movq -128(%rsp,%rsi), %rax
+; AVX2-NEXT: popq %rcx
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: extractSub512_64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rax
+; AVX512-NEXT: vmovups (%rdi), %ymm0
+; AVX512-NEXT: vmovups 32(%rdi), %ymm1
+; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: # kill: def $esi killed $esi def $rsi
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: andl $56, %esi
+; AVX512-NEXT: movq -128(%rsp,%rsi), %rax
+; AVX512-NEXT: popq %rcx
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %idx_bounds = and i32 %idx, 511
+ %idx_align = and i32 %idx_bounds, -64
+ %sh = zext nneg i32 %idx_align to i512
+ %ld = load i512, ptr %word, align 8
+ %sub = lshr i512 %ld, %sh
+ %res = trunc i512 %sub to i64
+ ret i64 %res
+}
+
+define i128 @extractSub512_128(ptr %word, i32 %idx) nounwind {
+; X86-LABEL: extractSub512_128:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $192, %esp
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl (%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%eax), %ebx
+; X86-NEXT: movl 44(%eax), %edi
+; X86-NEXT: movl 48(%eax), %esi
+; X86-NEXT: movl 52(%eax), %edx
+; X86-NEXT: movl 56(%eax), %ecx
+; X86-NEXT: movl 60(%eax), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: shrl $3, %edi
+; X86-NEXT: andl $48, %edi
+; X86-NEXT: movl 48(%esp,%edi), %ecx
+; X86-NEXT: movl 52(%esp,%edi), %edx
+; X86-NEXT: movl 56(%esp,%edi), %esi
+; X86-NEXT: movl 60(%esp,%edi), %edi
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
+; SSE-LABEL: extractSub512_128:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rax
+; SSE-NEXT: # kill: def $esi killed $esi def $rsi
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: movups 32(%rdi), %xmm2
+; SSE-NEXT: movups 48(%rdi), %xmm3
+; SSE-NEXT: xorps %xmm4, %xmm4
+; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $48, %esi
+; SSE-NEXT: movq -128(%rsp,%rsi), %rax
+; SSE-NEXT: movq -120(%rsp,%rsi), %rdx
+; SSE-NEXT: popq %rcx
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractSub512_128:
+; AVX: # %bb.0:
+; AVX-NEXT: pushq %rax
+; AVX-NEXT: # kill: def $esi killed $esi def $rsi
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: vmovups 32(%rdi), %ymm1
+; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrl $3, %esi
+; AVX-NEXT: andl $48, %esi
+; AVX-NEXT: movq -128(%rsp,%rsi), %rax
+; AVX-NEXT: movq -120(%rsp,%rsi), %rdx
+; AVX-NEXT: popq %rcx
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %idx_bounds = and i32 %idx, 511
+ %idx_align = and i32 %idx_bounds, -128
+ %sh = zext nneg i32 %idx_align to i512
+ %ld = load i512, ptr %word, align 8
+ %sub = lshr i512 %ld, %sh
+ %res = trunc i512 %sub to i128
+ ret i128 %res
+}
+
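+; Same pattern scaled to i4096: the index is masked to [0,4095] and aligned
+; down to a 64-bit boundary, so only a single qword has to be reloaded from
+; the 512-byte spill slot.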
+define i64 @extractSub4096_64(ptr %word, i32 %idx) nounwind {
+; X86-LABEL: extractSub4096_64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $1536, %esp # imm = 0x600
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 4(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 52(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 64(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 68(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 72(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 76(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 84(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 88(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 92(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 96(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 100(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 104(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 108(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 112(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 116(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 120(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 124(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 128(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 132(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 136(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 140(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 144(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 148(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 152(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 156(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 160(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 164(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 168(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 172(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 176(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 180(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 184(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 188(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 192(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 196(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 200(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 204(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 208(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 212(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 216(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 220(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 224(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 228(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 232(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 236(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 240(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 244(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 248(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 252(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 256(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 260(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 264(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 268(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 272(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 276(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 280(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 284(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 288(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 292(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 296(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 300(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 304(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 308(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 312(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 316(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 320(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 324(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 328(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 332(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 336(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 340(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 344(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 348(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 352(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 356(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 360(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 364(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 368(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 372(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 376(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 380(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 384(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 388(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 392(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 396(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 400(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 404(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 408(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 412(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 416(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 420(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 424(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 428(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 432(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 436(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 440(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 444(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 448(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 452(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 456(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 460(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 464(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 468(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 472(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 476(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 480(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 484(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 488(%eax), %ebx
+; X86-NEXT: movl 492(%eax), %edi
+; X86-NEXT: movl 496(%eax), %esi
+; X86-NEXT: movl 500(%eax), %edx
+; X86-NEXT: movl 504(%eax), %ecx
+; X86-NEXT: movl 508(%eax), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $4032, %ecx # imm = 0xFC0
+; X86-NEXT: andl 12(%ebp), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: movl 496(%esp,%ecx), %eax
+; X86-NEXT: movl 500(%esp,%ecx), %edx
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
+; SSE-LABEL: extractSub4096_64:
+; SSE: # %bb.0:
+; SSE-NEXT: subq $1176, %rsp # imm = 0x498
+; SSE-NEXT: # kill: def $esi killed $esi def $rsi
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movups 16(%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movups 32(%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movups 48(%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movups 64(%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movups 80(%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movups 96(%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movups 112(%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movups 128(%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE-NEXT: movups 144(%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movups 160(%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movups 176(%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movups 192(%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movups 208(%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movups 224(%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movups 240(%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movups 256(%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movups 272(%rdi), %xmm15
+; SSE-NEXT: movups 288(%rdi), %xmm14
+; SSE-NEXT: movups 304(%rdi), %xmm13
+; SSE-NEXT: movups 320(%rdi), %xmm12
+; SSE-NEXT: movups 336(%rdi), %xmm11
+; SSE-NEXT: movups 352(%rdi), %xmm10
+; SSE-NEXT: movups 368(%rdi), %xmm9
+; SSE-NEXT: movups 384(%rdi), %xmm8
+; SSE-NEXT: movups 400(%rdi), %xmm7
+; SSE-NEXT: movups 416(%rdi), %xmm6
+; SSE-NEXT: movups 432(%rdi), %xmm5
+; SSE-NEXT: movups 448(%rdi), %xmm4
+; SSE-NEXT: movups 464(%rdi), %xmm3
+; SSE-NEXT: movups 480(%rdi), %xmm2
+; SSE-NEXT: movups 496(%rdi), %xmm1
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: andl $4032, %esi # imm = 0xFC0
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: movq 144(%rsp,%rsi), %rax
+; SSE-NEXT: addq $1176, %rsp # imm = 0x498
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: extractSub4096_64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: subq $936, %rsp # imm = 0x3A8
+; AVX2-NEXT: vmovups (%rdi), %ymm0
+; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovups 32(%rdi), %ymm1
+; AVX2-NEXT: vmovups 64(%rdi), %ymm2
+; AVX2-NEXT: vmovups 96(%rdi), %ymm3
+; AVX2-NEXT: vmovups 128(%rdi), %ymm4
+; AVX2-NEXT: vmovups 160(%rdi), %ymm5
+; AVX2-NEXT: vmovups 192(%rdi), %ymm6
+; AVX2-NEXT: vmovups 224(%rdi), %ymm7
+; AVX2-NEXT: vmovups 256(%rdi), %ymm8
+; AVX2-NEXT: vmovups 288(%rdi), %ymm9
+; AVX2-NEXT: vmovups 320(%rdi), %ymm10
+; AVX2-NEXT: vmovups 352(%rdi), %ymm11
+; AVX2-NEXT: vmovups 384(%rdi), %ymm12
+; AVX2-NEXT: vmovups 416(%rdi), %ymm13
+; AVX2-NEXT: vmovups 448(%rdi), %ymm14
+; AVX2-NEXT: vmovups 480(%rdi), %ymm15
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm15, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm14, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm13, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm12, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm11, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm9, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm7, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm6, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm5, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm3, (%rsp)
+; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: # kill: def $esi killed $esi def $rsi
+; AVX2-NEXT: andl $4032, %esi # imm = 0xFC0
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: movq -96(%rsp,%rsi), %rax
+; AVX2-NEXT: addq $936, %rsp # imm = 0x3A8
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: extractSub4096_64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: subq $904, %rsp # imm = 0x388
+; AVX512-NEXT: # kill: def $esi killed $esi def $rsi
+; AVX512-NEXT: vmovups (%rdi), %ymm0
+; AVX512-NEXT: vmovups 32(%rdi), %ymm1
+; AVX512-NEXT: vmovups 64(%rdi), %ymm2
+; AVX512-NEXT: vmovups 96(%rdi), %ymm3
+; AVX512-NEXT: vmovups 128(%rdi), %ymm4
+; AVX512-NEXT: vmovups 160(%rdi), %ymm5
+; AVX512-NEXT: vmovups 192(%rdi), %ymm6
+; AVX512-NEXT: vmovups 224(%rdi), %ymm7
+; AVX512-NEXT: vmovups 256(%rdi), %ymm8
+; AVX512-NEXT: vmovups 288(%rdi), %ymm9
+; AVX512-NEXT: vmovups 320(%rdi), %ymm10
+; AVX512-NEXT: vmovups 352(%rdi), %ymm11
+; AVX512-NEXT: vmovups 384(%rdi), %ymm12
+; AVX512-NEXT: vmovups 416(%rdi), %ymm13
+; AVX512-NEXT: andl $4032, %esi # imm = 0xFC0
+; AVX512-NEXT: vmovups 448(%rdi), %ymm14
+; AVX512-NEXT: vmovups 480(%rdi), %ymm15
+; AVX512-NEXT: vxorps %xmm16, %xmm16, %xmm16
+; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm15, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm14, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm13, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm12, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm11, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm10, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm9, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm7, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm6, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm5, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm4, (%rsp)
+; AVX512-NEXT: vmovups %ymm3, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: movq -128(%rsp,%rsi), %rax
+; AVX512-NEXT: addq $904, %rsp # imm = 0x388
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %idx_bounds = and i32 %idx, 4095
+ %idx_align = and i32 %idx_bounds, -64
+ %sh = zext nneg i32 %idx_align to i4096
+ %ld = load i4096, ptr %word, align 8
+ %sub = lshr i4096 %ld, %sh
+ %res = trunc i4096 %sub to i64
+ ret i64 %res
+}